claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
1,083 lines • 60 kB
JavaScript
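/**
* Worker Daemon Service
* Node.js-based background worker system that auto-runs like shell daemons
*
* Workers (default intervals; see DEFAULT_WORKERS below):
* - map: Codebase mapping (15 min interval)
* - audit: Security analysis (10 min interval)
* - optimize: Performance optimization (15 min interval)
* - consolidate: Memory consolidation (30 min interval)
* - testgaps: Test coverage analysis (20 min interval)
* - predict: Predictive preloading (10 min interval, disabled by default)
* - document: Auto-documentation (60 min interval, disabled by default)
*/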
import { EventEmitter } from 'events';
import { existsSync, mkdirSync, writeFileSync, readFileSync, appendFileSync, unlinkSync, renameSync } from 'fs';
import { createRequire } from 'module';
import { cpus } from 'os';
import { join } from 'path';
import { HeadlessWorkerExecutor, isHeadlessWorker, } from './headless-worker-executor.js';
// Built-in worker roster, written as
//   [type, interval (min), first-run offset (min), priority, description, enabled]
// and expanded into config objects below. Offsets stagger the first runs so the
// enabled workers do not all fire together (P0 fix: map interval 5min -> 15min).
const DEFAULT_WORKERS = [
    ['map', 15, 0, 'normal', 'Codebase mapping', true],
    ['audit', 10, 2, 'critical', 'Security analysis', true],
    ['optimize', 15, 4, 'high', 'Performance optimization', true],
    ['consolidate', 30, 6, 'low', 'Memory consolidation', true],
    ['testgaps', 20, 8, 'normal', 'Test coverage analysis', true],
    ['predict', 10, 0, 'low', 'Predictive preloading', false],
    ['document', 60, 0, 'low', 'Auto-documentation', false],
].map(([type, intervalMinutes, offsetMinutes, priority, description, enabled]) => ({
    type,
    intervalMs: intervalMinutes * 60 * 1000,
    offsetMs: offsetMinutes * 60 * 1000,
    priority,
    description,
    enabled,
}));
// Daemon-side worker timeout. It has to be longer than the longest per-worker
// headless timeout (15 min for audit/refactor): the earlier 5 min value let the
// daemon timeout fire before the executor's own, leaving orphan processes (#1117).
const DEFAULT_WORKER_TIMEOUT_MS = 16 * 60_000;
/**
* Worker Daemon - Manages background workers with Node.js
*/
export class WorkerDaemon extends EventEmitter {
// Effective configuration, resolved in the constructor
// (constructor argument > config file > computed defaults).
config;
// Runtime state per worker type (run/success/failure counters, average
// duration, lastRun / lastStartedAt / nextRun timestamps, isRunning flag).
workers = new Map();
// Pending setTimeout handle for each scheduled worker, keyed by worker type.
timers = new Map();
// #1845: separate timer for the MCP-dispatch queue poller. Kept off
// the per-worker map so stop() clears both kinds without confusion.
queuePollTimer;
// Set to true by start() and back to false by stop().
running = false;
// Date of the most recent start(); unset until the daemon has been started.
startedAt;
// Project directory; daemon files default to `<projectRoot>/.claude-flow`.
projectRoot;
runningWorkers = new Set(); // Track concurrent workers (worker types currently executing)
pendingWorkers = []; // Queue for deferred workers (worker types, consumed first-in first-out)
// Headless execution support: the executor instance (null until
// initHeadlessExecutor() creates it) and the result of its isAvailable() probe.
headlessExecutor = null;
headlessAvailable = false;
// Preserve the original constructor config so we can detect explicit overrides
// during state restoration (R1: constructor config takes priority over stale state)
originalConfig;
constructor(projectRoot, config) {
super();
this.projectRoot = projectRoot;
this.originalConfig = config;
const claudeFlowDir = join(projectRoot, '.claude-flow');
// Read daemon config from .claude-flow/config.json (Layer B)
const fileConfig = this.readDaemonConfigFromFile(claudeFlowDir);
// CPU-proportional smart default instead of hardcoded 2.0
const cpuCount = WorkerDaemon.getEffectiveCpuCount();
let smartMaxCpuLoad = Math.max(cpuCount * 0.8, 2.0); // Floor of 2.0 for single-CPU machines
// #2110 — WSL2 reports `/proc/loadavg` values that include Windows-side
// process counts mapped into the Linux kernel. Real load on a 4-CPU
// WSL2 host can be 200-400 even when the Linux side is idle. The
// default gate of `cpuCount * 0.8` always trips, deferring every
// worker as "CPU load too high" while the daemon reports healthy.
// Bump the floor to 1000 when WSL is detected so the gate is
// effectively disabled (real load on Linux side rarely exceeds 100
// even under heavy contention).
if (WorkerDaemon.isWslEnvironment()) {
smartMaxCpuLoad = Math.max(smartMaxCpuLoad, 1000);
}
// Platform-aware default: macOS os.freemem() excludes reclaimable file cache,
// so reported "free" is much lower than actually available memory.
// Linux reports available memory (including reclaimable cache) more accurately.
const defaultMinFreeMemory = process.platform === 'darwin' ? 5 : 10;
// Priority: constructor arg > config.json > smart default
// For resourceThresholds, merge field-by-field so partial overrides
// (e.g. only --max-cpu-load) still pick up defaults for other fields.
this.config = {
autoStart: config?.autoStart ?? fileConfig.autoStart ?? false,
logDir: config?.logDir ?? join(claudeFlowDir, 'logs'),
stateFile: config?.stateFile ?? join(claudeFlowDir, 'daemon-state.json'),
maxConcurrent: config?.maxConcurrent ?? fileConfig.maxConcurrent ?? 2,
workerTimeoutMs: config?.workerTimeoutMs ?? fileConfig.workerTimeoutMs ?? DEFAULT_WORKER_TIMEOUT_MS,
resourceThresholds: {
maxCpuLoad: config?.resourceThresholds?.maxCpuLoad ?? fileConfig.maxCpuLoad ?? smartMaxCpuLoad,
minFreeMemoryPercent: config?.resourceThresholds?.minFreeMemoryPercent ?? fileConfig.minFreeMemoryPercent ?? defaultMinFreeMemory,
},
workers: config?.workers ?? DEFAULT_WORKERS,
};
// Setup graceful shutdown handlers
this.setupShutdownHandlers();
// #1855: install crash handlers so uncaught exceptions and unhandled
// rejections don't leak the PID file or orphan child processes.
this.installCrashHandlers();
// Ensure directories exist
if (!existsSync(claudeFlowDir)) {
mkdirSync(claudeFlowDir, { recursive: true });
}
if (!existsSync(this.config.logDir)) {
mkdirSync(this.config.logDir, { recursive: true });
}
// Initialize worker states
this.initializeWorkerStates();
// Initialize headless executor (async, non-blocking)
this.initHeadlessExecutor().catch((err) => {
this.log('warn', `Headless executor init failed: ${err}`);
});
}
/**
* Initialize headless executor if Claude Code is available
*/
async initHeadlessExecutor() {
try {
this.headlessExecutor = new HeadlessWorkerExecutor(this.projectRoot, {
maxConcurrent: this.config.maxConcurrent,
});
this.headlessAvailable = await this.headlessExecutor.isAvailable();
if (this.headlessAvailable) {
this.log('info', 'Claude Code headless mode available - AI workers enabled');
// Forward headless executor events. #1855: also snapshot the
// active child PIDs to disk on every transition so the next
// lifetime can reap orphans after a hard crash.
this.headlessExecutor.on('execution:start', (data) => {
this.writeChildrenSnapshot();
this.emit('headless:start', data);
});
this.headlessExecutor.on('execution:complete', (data) => {
this.writeChildrenSnapshot();
this.emit('headless:complete', data);
});
this.headlessExecutor.on('execution:error', (data) => {
this.writeChildrenSnapshot();
this.emit('headless:error', data);
});
this.headlessExecutor.on('output', (data) => {
this.emit('headless:output', data);
});
}
else {
this.log('info', 'Claude Code not found - AI workers will run in local fallback mode');
}
}
catch (error) {
this.log('warn', `Failed to initialize headless executor: ${error}`);
this.headlessAvailable = false;
}
}
/**
* Report whether headless execution is available.
*
* @returns the `headlessAvailable` flag: false until initHeadlessExecutor()
*   has finished its availability probe, and false again if that
*   initialization failed.
*/
isHeadlessAvailable() {
return this.headlessAvailable;
}
/**
* Get the headless executor instance.
*
* @returns the executor created by initHeadlessExecutor(), or null if it has
*   not been created yet. A non-null value does not imply availability;
*   check isHeadlessAvailable() for that.
*/
getHeadlessExecutor() {
return this.headlessExecutor;
}
/**
* NOTE: this comment documents getEffectiveCpuCount(), which is defined
* after isWslEnvironment() below.
*
* Detect effective CPU count for the current environment.
*
* Inside Docker / K8s containers, os.cpus().length reports the HOST CPU
* count, not the container limit (Node.js #28762 — wontfix). We read
* cgroup v2 / v1 quota files first so the maxCpuLoad threshold stays
* meaningful under resource-limited containers.
*/
/**
* #2110 — detect WSL2 / WSL1 so the CPU-load gate can use a sane
* default. `/proc/loadavg` on WSL maps in Windows-side process counts
* and routinely reports values 100-1000x larger than real Linux load.
*
* Detection order:
* 1. `WSL_DISTRO_NAME` env var (set by Microsoft's WSL launcher)
* 2. `WSL_INTEROP` env var (set by recent WSL2)
* 3. `/proc/sys/kernel/osrelease` contains "microsoft" or "WSL"
* (kernel build marker; survives env stripping)
*/
static isWslEnvironment() {
if (process.env.WSL_DISTRO_NAME || process.env.WSL_INTEROP)
return true;
try {
const osrelease = readFileSync('/proc/sys/kernel/osrelease', 'utf8').toLowerCase();
if (osrelease.includes('microsoft') || osrelease.includes('wsl'))
return true;
}
catch { /* not on Linux or /proc inaccessible */ }
return false;
}
static getEffectiveCpuCount() {
// 1. Try cgroup v2: /sys/fs/cgroup/cpu.max
try {
const cpuMax = readFileSync('/sys/fs/cgroup/cpu.max', 'utf8').trim();
const [quotaStr, periodStr] = cpuMax.split(' ');
if (quotaStr !== 'max') {
const quota = parseInt(quotaStr, 10);
const period = parseInt(periodStr, 10);
if (quota > 0 && period > 0)
return Math.ceil(quota / period);
}
}
catch { /* not in cgroup v2 */ }
// 2. Try cgroup v1: /sys/fs/cgroup/cpu/cpu.cfs_quota_us
try {
const quota = parseInt(readFileSync('/sys/fs/cgroup/cpu/cpu.cfs_quota_us', 'utf8').trim(), 10);
const period = parseInt(readFileSync('/sys/fs/cgroup/cpu/cpu.cfs_period_us', 'utf8').trim(), 10);
if (quota > 0 && period > 0)
return Math.ceil(quota / period);
}
catch { /* not in cgroup v1 */ }
// 3. Fallback to os.cpus().length
return cpus().length || 1;
}
/**
* Read daemon-specific config from .claude-flow/config.{json,yaml,yml}.
* Supports dot-notation keys like 'daemon.resourceThresholds.maxCpuLoad'.
* #1844: prefer JSON when both exist (existing behavior) but fall back
* to YAML so operators using the v3 canonical YAML format aren't silently
* ignored. The chosen path is logged at info level.
*/
readDaemonConfigFromFile(claudeFlowDir) {
const jsonPath = join(claudeFlowDir, 'config.json');
const yamlPath = join(claudeFlowDir, 'config.yaml');
const ymlPath = join(claudeFlowDir, 'config.yml');
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let raw;
let chosenPath;
if (existsSync(jsonPath)) {
try {
raw = JSON.parse(readFileSync(jsonPath, 'utf-8'));
chosenPath = jsonPath;
}
catch {
return {};
}
}
else if (existsSync(yamlPath) || existsSync(ymlPath)) {
const yPath = existsSync(yamlPath) ? yamlPath : ymlPath;
try {
// Lazy-load yaml so the daemon doesn't hard-require it; if the
// dep isn't installed, fall back to the previous warn-only path.
// eslint-disable-next-line @typescript-eslint/no-var-requires
const yamlMod = require('yaml');
const parsed = yamlMod.parse(readFileSync(yPath, 'utf-8'));
if (parsed && typeof parsed === 'object') {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
raw = parsed;
chosenPath = yPath;
}
}
catch {
this.log('warn', `Found ${yPath} but yaml parser unavailable. Install \`yaml\` or convert to JSON. Falling back to defaults.`);
return {};
}
}
if (!raw || !chosenPath) {
return {};
}
this.log('info', `Daemon config loaded from ${chosenPath}`);
try {
// Support both flat keys at root and nested under scopes.project
const cfg = raw?.scopes?.project ?? raw;
const rawCpuLoad = cfg['daemon.resourceThresholds.maxCpuLoad'] ?? raw['daemon.resourceThresholds.maxCpuLoad'];
const rawMinMem = cfg['daemon.resourceThresholds.minFreeMemoryPercent'] ?? raw['daemon.resourceThresholds.minFreeMemoryPercent'];
const rawMaxConcurrent = cfg['daemon.maxConcurrent'] ?? raw['daemon.maxConcurrent'];
const rawTimeout = cfg['daemon.workerTimeoutMs'] ?? raw['daemon.workerTimeoutMs'];
return {
autoStart: typeof raw['daemon.autoStart'] === 'boolean' ? raw['daemon.autoStart'] : undefined,
maxConcurrent: (typeof rawMaxConcurrent === 'number' && rawMaxConcurrent > 0) ? rawMaxConcurrent : undefined,
workerTimeoutMs: (typeof rawTimeout === 'number' && rawTimeout > 0) ? rawTimeout : undefined,
maxCpuLoad: (typeof rawCpuLoad === 'number' && rawCpuLoad > 0 && rawCpuLoad < 1000) ? rawCpuLoad : undefined,
minFreeMemoryPercent: (typeof rawMinMem === 'number' && rawMinMem >= 0 && rawMinMem <= 100) ? rawMinMem : undefined,
};
}
catch {
return {};
}
}
/**
* Setup graceful shutdown handlers
*/
setupShutdownHandlers() {
const shutdown = async () => {
this.log('info', 'Received shutdown signal, stopping daemon...');
await this.stop();
process.exit(0);
};
process.on('SIGTERM', shutdown);
process.on('SIGINT', shutdown);
process.on('SIGHUP', shutdown);
}
/**
* #1855: install crash handlers for uncaught exceptions and unhandled
* rejections. Without these, a thrown error from any timer callback,
* worker logic path, or transitive import crashes the daemon process
* silently — the PID file leaks and any in-flight child processes
* orphan. With these, we log a structured crash record, run stop()
* to clean up, then exit 1 so the process actually dies (otherwise
* Node would crash anyway after the handler returns).
*/
installCrashHandlers() {
const onCrash = (kind, err) => {
// Best-effort logging; never throw from inside the crash handler.
try {
this.writeCrashRecord(kind, err);
}
catch { /* nothing more we can do */ }
try {
// Synchronous stop — don't await; the process is dying. Just
// remove the PID file and snapshot state so the next start
// sees a clean slate.
this.removePidFile();
this.saveState();
// Snapshot any in-flight child PIDs one last time so the next
// lifetime can reap them.
this.writeChildrenSnapshot();
}
catch { /* ignore */ }
// Exit non-zero so supervisors / shells see the failure.
process.exit(1);
};
process.on('uncaughtException', (err) => onCrash('uncaughtException', err));
process.on('unhandledRejection', (err) => onCrash('unhandledRejection', err));
}
/**
* Append a structured crash record to .claude-flow/logs/crash.log.
* Inspectable by hand or via `ruflo daemon status` follow-ups.
*/
writeCrashRecord(kind, err) {
const logDir = this.config.logDir;
if (!existsSync(logDir))
mkdirSync(logDir, { recursive: true });
const crashLog = join(logDir, 'crash.log');
const ts = new Date().toISOString();
const message = err instanceof Error ? err.message : String(err);
const stack = err instanceof Error && err.stack ? err.stack : '<no stack>';
const record = `[${ts}] [${kind}] pid=${process.pid} ${message}\n${stack}\n---\n`;
appendFileSync(crashLog, record, 'utf-8');
this.log('warn', `Daemon crashed (${kind}): ${message} — see ${crashLog}`);
}
/**
* Path to the on-disk children registry — list of headless worker
* child PIDs the daemon currently owns. #1855: written on every
* execution:start / :complete / :error transition; read by the next
* lifetime to reap orphans after a hard crash.
*
* @returns `<projectRoot>/.claude-flow/daemon-children.json`
*/
get childrenFile() {
return join(this.projectRoot, '.claude-flow', 'daemon-children.json');
}
/**
* #1856: detect workers that were mid-flight when the previous daemon
* lifetime ended. A mid-flight worker has `lastStartedAt > lastRun`
* (started after the last successful completion). On crash recovery
* we count these as failures so the run-counter math stays consistent
* (`runCount === successCount + failureCount`). Workers naturally
* retry at their next scheduled interval; we deliberately don't
* immediately re-run because the failure may have been deterministic.
*/
detectMidFlightFailures() {
let detected = 0;
for (const [type, state] of this.workers.entries()) {
const startedAt = state.lastStartedAt?.getTime() ?? 0;
const lastRunAt = state.lastRun?.getTime() ?? 0;
// started after the last successful completion → was mid-flight
if (startedAt > 0 && startedAt > lastRunAt) {
state.failureCount++;
state.isRunning = false;
// Don't bump runCount — it was already incremented at start
this.log('info', `Worker ${type} was mid-flight at last crash (started ${state.lastStartedAt?.toISOString()}); counted as failure, will retry at next scheduled interval`);
detected++;
}
}
if (detected > 0) {
this.saveState();
}
}
/**
* Snapshot the currently-active headless worker child PIDs to disk.
* Best-effort; failures don't propagate.
*/
writeChildrenSnapshot() {
if (!this.headlessExecutor)
return;
try {
const pids = this.headlessExecutor.getActiveChildPids();
const dir = join(this.projectRoot, '.claude-flow');
if (!existsSync(dir))
mkdirSync(dir, { recursive: true });
writeFileSync(this.childrenFile, JSON.stringify({ pids, daemonPid: process.pid, timestamp: new Date().toISOString() }, null, 2), 'utf-8');
}
catch { /* best-effort */ }
}
/**
* #1855: reap orphan headless worker children left behind by a
* previous crashed lifetime. Reads `.claude-flow/daemon-children.json`,
* SIGTERMs any PID still alive that doesn't belong to the current
* daemon, then truncates the file. Called at the top of `start()`
* so the next lifetime starts with a clean process tree.
*/
reapOrphanedChildren() {
const file = this.childrenFile;
if (!existsSync(file))
return;
let snapshot;
try {
snapshot = JSON.parse(readFileSync(file, 'utf-8'));
}
catch {
try {
unlinkSync(file);
}
catch { /* ignore */ }
return;
}
const pids = Array.isArray(snapshot.pids) ? snapshot.pids : [];
let reaped = 0;
for (const pid of pids) {
if (typeof pid !== 'number' || pid <= 0)
continue;
if (pid === process.pid)
continue; // never our own PID
try {
process.kill(pid, 0); // is alive?
process.kill(pid, 'SIGTERM');
reaped++;
}
catch {
// already dead — fine
}
}
if (reaped > 0) {
this.log('info', `Reaped ${reaped} orphan headless worker child(ren) from previous lifetime`);
}
try {
unlinkSync(file);
}
catch { /* ignore */ }
}
/**
* Check if system resources allow worker execution
*/
async canRunWorker() {
const os = await import('os');
const cpuLoad = os.loadavg()[0];
const totalMem = os.totalmem();
const freeMem = os.freemem();
const freePercent = (freeMem / totalMem) * 100;
if (cpuLoad > this.config.resourceThresholds.maxCpuLoad) {
return { allowed: false, reason: `CPU load too high: ${cpuLoad.toFixed(2)}` };
}
if (freePercent < this.config.resourceThresholds.minFreeMemoryPercent) {
return { allowed: false, reason: `Memory too low: ${freePercent.toFixed(1)}% free` };
}
return { allowed: true };
}
/**
* Process pending workers queue
*
* When executeWorkerWithConcurrencyControl defers a worker (returns null),
* we break immediately to avoid a busy-wait loop — the deferred worker is
* already back on the pendingWorkers queue by that point. If no workers are
* currently running when we break, we schedule a backoff retry so the queue
* does not get permanently stuck.
*/
async processPendingWorkers() {
while (this.pendingWorkers.length > 0 && this.runningWorkers.size < this.config.maxConcurrent) {
const workerType = this.pendingWorkers.shift();
const workerConfig = this.config.workers.find(w => w.type === workerType);
if (workerConfig) {
const result = await this.executeWorkerWithConcurrencyControl(workerConfig);
if (result === null) {
// Worker was deferred (resource pressure or concurrency limit).
// Break to avoid tight-looping — the next executeWorker() completion
// will call processPendingWorkers() again via the finally block.
if (this.runningWorkers.size === 0) {
// No workers running means nobody will trigger the finally-block
// callback, so schedule a backoff retry to avoid a stuck queue.
setTimeout(() => this.processPendingWorkers(), 30_000).unref();
}
break;
}
}
}
}
initializeWorkerStates() {
// Try to restore state from file
if (existsSync(this.config.stateFile)) {
try {
const saved = JSON.parse(readFileSync(this.config.stateFile, 'utf-8'));
// CRITICAL: Restore worker config (including enabled flag) from saved state
// This fixes #950: daemon enable command not persisting worker state
if (saved.config?.workers && Array.isArray(saved.config.workers)) {
for (const savedWorker of saved.config.workers) {
const workerConfig = this.config.workers.find(w => w.type === savedWorker.type);
if (workerConfig && typeof savedWorker.enabled === 'boolean') {
workerConfig.enabled = savedWorker.enabled;
}
}
}
// Restore resourceThresholds, maxConcurrent, workerTimeoutMs from saved state
// Only restore if valid numeric values within sane ranges
if (saved.config?.resourceThresholds && !this.originalConfig?.resourceThresholds) {
const rt = saved.config.resourceThresholds;
if (typeof rt.maxCpuLoad === 'number' && rt.maxCpuLoad > 0 && rt.maxCpuLoad < 1000) {
this.config.resourceThresholds.maxCpuLoad = rt.maxCpuLoad;
}
if (typeof rt.minFreeMemoryPercent === 'number' && rt.minFreeMemoryPercent >= 0 && rt.minFreeMemoryPercent <= 100) {
this.config.resourceThresholds.minFreeMemoryPercent = rt.minFreeMemoryPercent;
}
}
if (typeof saved.config?.maxConcurrent === 'number' && saved.config.maxConcurrent > 0) {
this.config.maxConcurrent = saved.config.maxConcurrent;
}
if (typeof saved.config?.workerTimeoutMs === 'number' && saved.config.workerTimeoutMs > 0) {
this.config.workerTimeoutMs = saved.config.workerTimeoutMs;
}
// Restore worker runtime states (runCount, successCount, etc.)
if (saved.workers) {
for (const [type, state] of Object.entries(saved.workers)) {
const savedState = state;
const lastRunValue = savedState.lastRun;
const lastStartedAtValue = savedState.lastStartedAt;
this.workers.set(type, {
runCount: savedState.runCount || 0,
successCount: savedState.successCount || 0,
failureCount: savedState.failureCount || 0,
averageDurationMs: savedState.averageDurationMs || 0,
lastRun: lastRunValue ? new Date(lastRunValue) : undefined,
lastStartedAt: lastStartedAtValue ? new Date(lastStartedAtValue) : undefined,
nextRun: undefined,
isRunning: false,
});
}
}
}
catch {
// Ignore parse errors, start fresh
}
}
// Initialize any missing workers
for (const workerConfig of this.config.workers) {
if (!this.workers.has(workerConfig.type)) {
this.workers.set(workerConfig.type, {
runCount: 0,
successCount: 0,
failureCount: 0,
averageDurationMs: 0,
isRunning: false,
});
}
}
}
/**
* Get the PID file path for singleton enforcement (#1395 Bug 3).
*
* @returns `<projectRoot>/.claude-flow/daemon.pid`
*/
get pidFile() {
return join(this.projectRoot, '.claude-flow', 'daemon.pid');
}
/**
* Check if another daemon instance is already running.
* Returns the existing PID if alive, or null if no daemon is running.
*
* #1853: ignore self-PID matches. The detached-spawn path in
* `commands/daemon.ts` writes the child's PID into the file as a
* fallback after a 500ms wait. If the child reaches `start()` slower
* than the parent's 500ms wait (observed on Node 25 / macOS 26), the
* child reads its own PID back from the file and concludes "another
* daemon is already running" — so it exits before scheduling workers
* and `daemon status` reports STOPPED forever. A daemon process is
* never "another instance" of itself; treat self-match as absence.
*/
checkExistingDaemon() {
if (!existsSync(this.pidFile))
return null;
try {
const pid = parseInt(readFileSync(this.pidFile, 'utf-8').trim(), 10);
if (isNaN(pid))
return null;
// #1853: a PID file containing our own PID is not "another daemon".
// Treat as absent so the start() path proceeds normally.
if (pid === process.pid)
return null;
// Check if process is alive (signal 0 = existence check)
process.kill(pid, 0);
return pid; // Process is alive
}
catch {
// Process is dead — clean up stale PID file
try {
unlinkSync(this.pidFile);
}
catch { /* ignore */ }
return null;
}
}
/**
* Write PID file for singleton enforcement.
*/
writePidFile() {
const dir = join(this.projectRoot, '.claude-flow');
if (!existsSync(dir))
mkdirSync(dir, { recursive: true });
writeFileSync(this.pidFile, String(process.pid), 'utf-8');
}
/**
* Remove PID file on shutdown.
* Best-effort: a missing file or any other unlink error is ignored.
*/
removePidFile() {
try {
unlinkSync(this.pidFile);
}
catch { /* ignore */ }
}
/**
* Start the daemon and all enabled workers
*/
async start() {
if (this.running) {
this.emit('warning', 'Daemon already running');
return;
}
// PID singleton enforcement (#1395 Bug 3): prevent daemon accumulation
const existingPid = this.checkExistingDaemon();
if (existingPid !== null) {
this.log('info', `Daemon already running (PID: ${existingPid}), skipping start`);
this.emit('warning', `Daemon already running (PID: ${existingPid})`);
return;
}
// #1855: reap orphan headless worker children left by a previous
// crashed lifetime, BEFORE we mark ourselves running and start
// accepting new work. The children file from the prior daemon's
// last-snapshot is the authoritative list.
this.reapOrphanedChildren();
// #1856: detect workers that were mid-flight at the previous crash
// and count them as failures so runCount/successCount/failureCount
// stay consistent. Workers retry naturally at their next scheduled
// interval — we don't immediately re-run them, which avoids a
// freshly-recovered daemon hammering the same code path that just
// killed it.
this.detectMidFlightFailures();
this.running = true;
this.startedAt = new Date();
this.writePidFile();
this.emit('started', { pid: process.pid, startedAt: this.startedAt });
// Schedule all enabled workers
for (const workerConfig of this.config.workers) {
if (workerConfig.enabled) {
this.scheduleWorker(workerConfig);
}
}
// #1845: poll the MCP-dispatch queue directory so workers requested
// via mcp__hooks_worker-dispatch (in a separate process) actually
// execute here. Previously the dispatch wrote to a process-local Map
// that the daemon could never see.
this.queuePollTimer = setInterval(() => {
void this.processDispatchQueue();
}, 5_000);
if (typeof this.queuePollTimer.unref === 'function') {
this.queuePollTimer.unref();
}
// Save state
this.saveState();
this.log('info', `Daemon started (PID: ${process.pid}, CPUs: ${cpus().length}, workers: ${this.config.workers.filter(w => w.enabled).length}, maxCpuLoad: ${this.config.resourceThresholds.maxCpuLoad}, minFreeMemoryPercent: ${this.config.resourceThresholds.minFreeMemoryPercent}%)`);
}
/**
* #1845: ingest queue entries written by mcp__hooks_worker-dispatch.
* Each entry is a JSON file at `.claude-flow/daemon-queue/<id>.json`
* with `{ workerId, trigger, context, enqueuedAt }`. We move processed
* files to `.claude-flow/daemon-queue/.processed/` so the daemon never
* re-runs the same dispatch and operators can inspect history.
*/
async processDispatchQueue() {
if (!this.running)
return;
const queueDir = join(this.projectRoot, '.claude-flow', 'daemon-queue');
if (!existsSync(queueDir))
return;
let entries;
try {
const fs = await import('fs');
entries = fs.readdirSync(queueDir).filter((n) => n.endsWith('.json'));
}
catch {
return;
}
if (entries.length === 0)
return;
const fs = await import('fs');
const processedDir = join(queueDir, '.processed');
if (!existsSync(processedDir)) {
try {
fs.mkdirSync(processedDir, { recursive: true });
}
catch { /* race ok */ }
}
for (const entry of entries) {
const src = join(queueDir, entry);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let payload;
try {
payload = JSON.parse(fs.readFileSync(src, 'utf-8'));
}
catch {
// Malformed entry — quarantine so we don't loop on it
try {
fs.renameSync(src, join(processedDir, `bad-${entry}`));
}
catch { /* nothing more we can do */ }
continue;
}
const trigger = payload?.trigger;
const workerId = payload?.workerId;
if (!trigger || !this.config.workers.some((w) => w.type === trigger)) {
try {
fs.renameSync(src, join(processedDir, `unknown-${entry}`));
}
catch { /* ok */ }
continue;
}
try {
this.log('info', `Dequeued ${trigger}${workerId ? ` (id=${workerId})` : ''} from MCP dispatch queue`);
await this.triggerWorker(trigger);
}
catch (err) {
this.log('warn', `Queued worker ${trigger} failed: ${err.message}`);
}
finally {
try {
fs.renameSync(src, join(processedDir, entry));
}
catch { /* ignore */ }
}
}
}
/**
* Stop the daemon and all workers
*/
async stop() {
if (!this.running) {
this.emit('warning', 'Daemon not running');
return;
}
// Clear all timers (convert to array to avoid iterator issues)
const timerEntries = Array.from(this.timers.entries());
for (const [type, timer] of timerEntries) {
clearTimeout(timer);
this.log('info', `Stopped worker: ${type}`);
}
this.timers.clear();
// #1845: stop the MCP-dispatch queue poller too.
if (this.queuePollTimer) {
clearInterval(this.queuePollTimer);
this.queuePollTimer = undefined;
}
this.running = false;
this.removePidFile();
this.saveState();
this.emit('stopped', { stoppedAt: new Date() });
this.log('info', 'Daemon stopped');
}
/**
* Get daemon status
*/
getStatus() {
return {
running: this.running,
pid: process.pid,
startedAt: this.startedAt,
workers: new Map(this.workers),
config: this.config,
};
}
/**
* Schedule a worker to run at intervals with staggered start
*/
scheduleWorker(workerConfig) {
const state = this.workers.get(workerConfig.type);
const internalConfig = workerConfig;
const staggerOffset = internalConfig.offsetMs || 0;
// Calculate initial delay with stagger offset
let initialDelay = staggerOffset;
if (state.lastRun) {
const timeSinceLastRun = Date.now() - state.lastRun.getTime();
initialDelay = Math.max(staggerOffset, workerConfig.intervalMs - timeSinceLastRun);
}
state.nextRun = new Date(Date.now() + initialDelay);
const runAndReschedule = async () => {
if (!this.running)
return;
// Use concurrency-controlled execution (P0 fix)
await this.executeWorkerWithConcurrencyControl(workerConfig);
// Reschedule
if (this.running) {
const timer = setTimeout(runAndReschedule, workerConfig.intervalMs);
this.timers.set(workerConfig.type, timer);
state.nextRun = new Date(Date.now() + workerConfig.intervalMs);
}
};
// Schedule first run with stagger offset
const timer = setTimeout(runAndReschedule, initialDelay);
this.timers.set(workerConfig.type, timer);
this.log('info', `Scheduled ${workerConfig.type} (interval: ${workerConfig.intervalMs / 1000}s, first run in ${initialDelay / 1000}s)`);
}
/**
* Execute a worker with concurrency control (P0 fix)
*/
async executeWorkerWithConcurrencyControl(workerConfig) {
// Check concurrency limit
if (this.runningWorkers.size >= this.config.maxConcurrent) {
this.log('info', `Worker ${workerConfig.type} deferred: max concurrent (${this.config.maxConcurrent}) reached`);
this.pendingWorkers.push(workerConfig.type);
this.emit('worker:deferred', { type: workerConfig.type, reason: 'max_concurrent' });
return null;
}
// Check resource availability
const resourceCheck = await this.canRunWorker();
if (!resourceCheck.allowed) {
this.log('info', `Worker ${workerConfig.type} deferred: ${resourceCheck.reason}`);
this.pendingWorkers.push(workerConfig.type);
this.emit('worker:deferred', { type: workerConfig.type, reason: resourceCheck.reason });
return null;
}
return this.executeWorker(workerConfig);
}
/**
* Execute a worker with timeout protection
*/
async executeWorker(workerConfig) {
const state = this.workers.get(workerConfig.type);
const workerId = `${workerConfig.type}_${Date.now()}`;
const startTime = Date.now();
// Track running worker
this.runningWorkers.add(workerConfig.type);
state.isRunning = true;
state.lastStartedAt = new Date(); // #1856: timestamp the start
this.saveState(); // persist before we run anything
this.emit('worker:start', { workerId, type: workerConfig.type });
this.log('info', `Starting worker: ${workerConfig.type} (${this.runningWorkers.size}/${this.config.maxConcurrent} concurrent)`);
try {
// Execute worker logic with timeout (P1 fix)
// Pass cleanup callback to kill orphan child processes on timeout (#1117)
const output = await this.runWithTimeout(() => this.runWorkerLogic(workerConfig), this.config.workerTimeoutMs, `Worker ${workerConfig.type} timed out after ${this.config.workerTimeoutMs / 1000}s`, () => {
// On timeout, cancel any headless execution to prevent orphan processes
if (this.headlessExecutor) {
this.headlessExecutor.cancelAll();
}
});
const durationMs = Date.now() - startTime;
// Update state
state.runCount++;
state.successCount++;
state.lastRun = new Date();
state.averageDurationMs = (state.averageDurationMs * (state.runCount - 1) + durationMs) / state.runCount;
state.isRunning = false;
const result = {
workerId,
type: workerConfig.type,
success: true,
durationMs,
output,
timestamp: new Date(),
};
this.emit('worker:complete', result);
this.log('info', `Worker ${workerConfig.type} completed in ${durationMs}ms`);
this.saveState();
return result;
}
catch (error) {
const durationMs = Date.now() - startTime;
state.runCount++;
state.failureCount++;
state.lastRun = new Date();
state.isRunning = false;
const result = {
workerId,
type: workerConfig.type,
success: false,
durationMs,
error: error instanceof Error ? error.message : String(error),
timestamp: new Date(),
};
this.emit('worker:error', result);
this.log('error', `Worker ${workerConfig.type} failed: ${result.error}`);
this.saveState();
return result;
}
finally {
// Remove from running set and process queue
this.runningWorkers.delete(workerConfig.type);
this.processPendingWorkers();
}
}
/**
* Run a function with timeout (P1 fix)
* @param fn - The async function to execute
* @param timeoutMs - Timeout in milliseconds
* @param timeoutMessage - Error message on timeout
* @param onTimeout - Optional cleanup callback invoked when timeout fires (#1117: kills orphan processes)
*/
async runWithTimeout(fn, timeoutMs, timeoutMessage, onTimeout) {
return new Promise((resolve, reject) => {
let settled = false;
const timer = setTimeout(() => {
if (settled)
return;
settled = true;
// Kill orphan child processes before rejecting (#1117)
if (onTimeout) {
try {
onTimeout();
}
catch {
// Ignore cleanup errors
}
}
reject(new Error(timeoutMessage));
}, timeoutMs);
fn()
.then((result) => {
if (settled)
return;
settled = true;
clearTimeout(timer);
resolve(result);
})
.catch((error) => {
if (settled)
return;
settled = true;
clearTimeout(timer);
reject(error);
});
});
}
/**
* Run the actual worker logic
*/
async runWorkerLogic(workerConfig) {
// Check if this is a headless worker type and headless execution is available
if (isHeadlessWorker(workerConfig.type) && this.headlessAvailable && this.headlessExecutor) {
try {
this.log('info', `Running ${workerConfig.type} in headless mode (Claude Code AI)`);
const result = await this.headlessExecutor.execute(workerConfig.type);
// #2110 — `HeadlessWorkerExecutor.execute()` returns
// `createErrorResult(...)` with `success: false` when
// `isAvailable()` is false, instead of throwing. The previous
// try/catch never fired in that path, and the result was
// persisted as mode:"headless" despite being a stub. Downstream
// dashboards / `memory stats` couldn't distinguish a real AI
// run from a fallback. Treat falsy success the same as throw.
const ok = result?.success === true;
if (!ok) {
const reason = result?.error ||
result?.note ||
'headless executor reported success=false';
this.log('warn', `Headless ${workerConfig.type} returned success=false (${String(reason).slice(0, 200)}); falling back to local mode`);
this.emit('headless:fallback', {
type: workerConfig.type,
error: String(reason).slice(0, 500),
});
// Fall through to local switch.
}
else {
// #1793: persist the headless result to the same metrics files the
// local workers write to. Without this, AI-mode runs produced rich
// parsedOutput that lived only in `.claude-flow/logs/headless/*` and
// never reached `.claude-flow/metrics/<name>.json` — `memory stats`
// and downstream consumers saw nothing despite successful runs.
try {
this.persistHeadlessResult(workerConfig.type, result);
}
catch (persistError) {
this.log('warn', `Failed to persist headless result for ${workerConfig.type}: ${persistError.message}`);
}
return {
mode: 'headless',
...result,
};
}
}
catch (error) {
this.log('warn', `Headless execution failed for ${workerConfig.type}, falling back to local mode`);
this.emit('headless:fallback', {
type: workerConfig.type,
error: error instanceof Error ? error.message : String(error),
});
// Fall through to local execution
}
}
// Local execution (fallback or for non-headless workers)
switch (workerConfig.type) {
case 'map':
return this.runMapWorker();
case 'audit':
return this.runAuditWorkerLocal();
case 'optimize':
return this.runOptimizeWorkerLocal();
case 'consolidate':
return this.runConsolidateWorker();
case 'testgaps':
return this.runTestGapsWorkerLocal();
case 'predict':
return this.runPredictWorkerLocal();
case 'document':
return this.runDocumentWorkerLocal();
case 'ultralearn':
return this.runUltralearnWorkerLocal();
case 'refactor':
return this.runRefactorWorkerLocal();
case 'deepdive':
return this.runDeepdiveWorkerLocal();
case 'benchmark':
return this.runBenchmarkWorkerLocal();
case 'preload':
return this.runPreloadWorkerLocal();
default:
return { status: 'unknown worker type', mode: 'local' };
}
}
/**
* #1793: persist a headless worker result to the same metrics file the
* local fallback writes to. Without this, AI-mode workers produced rich
* structured output (audit findings, perf signals, test-gap analysis)
* that lived only in `.claude-flow/logs/headless/*_result.log` and was
* invisible to `npx ruflo memory stats` or the metrics consumers.
*
* The mapping mirrors the `*Local` worker implementations below so a
* single consumer path works regardless of execution mode.
*/
persistHeadlessResult(workerType, result) {
const metricsDir = join(this.projectRoot, '.claude-flow', 'metrics');
if (!existsSync(metricsDir))
mkdirSync(metricsDir, { recursive: true });
// Filename mirrors the local-mode worker writes (security-audit.json,
// performance.json, test-gaps.json) so a downstream reader doesn't
// care which mode produced the data.
const filenameMap = {
audit: 'security-audit.json',
optimize: 'performance.json',
testgaps: 'test-gaps.json',
document: 'documentation.json',
refactor: 'refactor.json',
deepdive: 'deepdive.json',
ultralearn: 'ultralearn.json',
predict: 'predictions.json',
};
const filename = filenameMap[workerType] ?? `${workerType}.json`;
const metricsFile = join(metricsDir, filename);
const persisted = {
timestamp: result.timestamp instanceof Date ? result.timestamp.toISOString() : new Date().toISOString(),
mode: 'headless',
workerType,
model: result.model,
durationMs: result.durationMs,
tokensUsed: result.tokensUsed,
executionId: result.executionId,
success: result.success,
// Structured findings live here when the worker emits JSON (e.g. the
// audit worker's vulnerability list). Fall back to a raw-output
// pointer so consumers can still locate the full log.
findings: result.parsedOutput ?? null,
rawOutputPreview: typeof result.output === 'string' ? result.output.slice(0, 2000) : undefined,
rawOutputLength: typeof result.output === 'string' ? result.output.length : 0,
};
writeFileSync(metricsFile, JSON.stringify(persisted, null, 2));
}
// Worker implementations
async runMapWorker() {
// Scan project structure and update metrics
const metricsFile = join(this.projectRoot, '.claude-flow', 'metrics', 'codebase-map.json');
const metricsDir = join(this.projectRoot, '.claude-flow', 'metrics');
if (!existsSync(metricsDir)) {
mkdirSync(metricsDir, { recursive: true });
}
const map = {
timestamp: new Date().toISOString(),
projectRoot: this.projectRoot,
structure: {
hasPackageJson: existsSync(join(this.projectRoot, 'package.json')),
hasTsConfig: existsSync(join(this.projectRoot, 'tsconfig.json')),
hasClaudeConfig: existsSync(join(this.projectRoot, '.claude')),
hasClaudeFlow: existsSync(join(this.projec