UNPKG

claude-flow

Version:

Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration

1,083 lines 60 kB
/**
 * Worker Daemon Service
 * Node.js-based background worker system that auto-runs like shell daemons
 *
 * Workers:
 * - map: Codebase mapping (5 min interval)
 * - audit: Security analysis (10 min interval)
 * - optimize: Performance optimization (15 min interval)
 * - consolidate: Memory consolidation (30 min interval)
 * - testgaps: Test coverage analysis (20 min interval)
 */
import { EventEmitter } from 'events';
import { existsSync, mkdirSync, writeFileSync, readFileSync, appendFileSync, unlinkSync, renameSync } from 'fs';
import { createRequire } from 'module';
import { cpus } from 'os';
import { join } from 'path';
import { HeadlessWorkerExecutor, isHeadlessWorker, } from './headless-worker-executor.js';
// Default worker configurations with improved intervals (P0 fix: map 5min -> 15min)
const DEFAULT_WORKERS = [
    { type: 'map', intervalMs: 15 * 60 * 1000, offsetMs: 0, priority: 'normal', description: 'Codebase mapping', enabled: true },
    { type: 'audit', intervalMs: 10 * 60 * 1000, offsetMs: 2 * 60 * 1000, priority: 'critical', description: 'Security analysis', enabled: true },
    { type: 'optimize', intervalMs: 15 * 60 * 1000, offsetMs: 4 * 60 * 1000, priority: 'high', description: 'Performance optimization', enabled: true },
    { type: 'consolidate', intervalMs: 30 * 60 * 1000, offsetMs: 6 * 60 * 1000, priority: 'low', description: 'Memory consolidation', enabled: true },
    { type: 'testgaps', intervalMs: 20 * 60 * 1000, offsetMs: 8 * 60 * 1000, priority: 'normal', description: 'Test coverage analysis', enabled: true },
    { type: 'predict', intervalMs: 10 * 60 * 1000, offsetMs: 0, priority: 'low', description: 'Predictive preloading', enabled: false },
    { type: 'document', intervalMs: 60 * 60 * 1000, offsetMs: 0, priority: 'low', description: 'Auto-documentation', enabled: false },
];
// Worker timeout — must exceed the longest per-worker headless timeout (15 min for audit/refactor).
// Previously 5 min, which caused orphan processes when daemon timeout fired before executor timeout (#1117).
const DEFAULT_WORKER_TIMEOUT_MS = 16 * 60 * 1000; /** * Worker Daemon - Manages background workers with Node.js */ export class WorkerDaemon extends EventEmitter { config; workers = new Map(); timers = new Map(); // #1845: separate timer for the MCP-dispatch queue poller. Kept off // the per-worker map so stop() clears both kinds without confusion. queuePollTimer; running = false; startedAt; projectRoot; runningWorkers = new Set(); // Track concurrent workers pendingWorkers = []; // Queue for deferred workers // Headless execution support headlessExecutor = null; headlessAvailable = false; // Preserve the original constructor config so we can detect explicit overrides // during state restoration (R1: constructor config takes priority over stale state) originalConfig; constructor(projectRoot, config) { super(); this.projectRoot = projectRoot; this.originalConfig = config; const claudeFlowDir = join(projectRoot, '.claude-flow'); // Read daemon config from .claude-flow/config.json (Layer B) const fileConfig = this.readDaemonConfigFromFile(claudeFlowDir); // CPU-proportional smart default instead of hardcoded 2.0 const cpuCount = WorkerDaemon.getEffectiveCpuCount(); let smartMaxCpuLoad = Math.max(cpuCount * 0.8, 2.0); // Floor of 2.0 for single-CPU machines // #2110 — WSL2 reports `/proc/loadavg` values that include Windows-side // process counts mapped into the Linux kernel. Real load on a 4-CPU // WSL2 host can be 200-400 even when the Linux side is idle. The // default gate of `cpuCount * 0.8` always trips, deferring every // worker as "CPU load too high" while the daemon reports healthy. // Bump the floor to 1000 when WSL is detected so the gate is // effectively disabled (real load on Linux side rarely exceeds 100 // even under heavy contention). 
if (WorkerDaemon.isWslEnvironment()) { smartMaxCpuLoad = Math.max(smartMaxCpuLoad, 1000); } // Platform-aware default: macOS os.freemem() excludes reclaimable file cache, // so reported "free" is much lower than actually available memory. // Linux reports available memory (including reclaimable cache) more accurately. const defaultMinFreeMemory = process.platform === 'darwin' ? 5 : 10; // Priority: constructor arg > config.json > smart default // For resourceThresholds, merge field-by-field so partial overrides // (e.g. only --max-cpu-load) still pick up defaults for other fields. this.config = { autoStart: config?.autoStart ?? fileConfig.autoStart ?? false, logDir: config?.logDir ?? join(claudeFlowDir, 'logs'), stateFile: config?.stateFile ?? join(claudeFlowDir, 'daemon-state.json'), maxConcurrent: config?.maxConcurrent ?? fileConfig.maxConcurrent ?? 2, workerTimeoutMs: config?.workerTimeoutMs ?? fileConfig.workerTimeoutMs ?? DEFAULT_WORKER_TIMEOUT_MS, resourceThresholds: { maxCpuLoad: config?.resourceThresholds?.maxCpuLoad ?? fileConfig.maxCpuLoad ?? smartMaxCpuLoad, minFreeMemoryPercent: config?.resourceThresholds?.minFreeMemoryPercent ?? fileConfig.minFreeMemoryPercent ?? defaultMinFreeMemory, }, workers: config?.workers ?? DEFAULT_WORKERS, }; // Setup graceful shutdown handlers this.setupShutdownHandlers(); // #1855: install crash handlers so uncaught exceptions and unhandled // rejections don't leak the PID file or orphan child processes. 
this.installCrashHandlers(); // Ensure directories exist if (!existsSync(claudeFlowDir)) { mkdirSync(claudeFlowDir, { recursive: true }); } if (!existsSync(this.config.logDir)) { mkdirSync(this.config.logDir, { recursive: true }); } // Initialize worker states this.initializeWorkerStates(); // Initialize headless executor (async, non-blocking) this.initHeadlessExecutor().catch((err) => { this.log('warn', `Headless executor init failed: ${err}`); }); } /** * Initialize headless executor if Claude Code is available */ async initHeadlessExecutor() { try { this.headlessExecutor = new HeadlessWorkerExecutor(this.projectRoot, { maxConcurrent: this.config.maxConcurrent, }); this.headlessAvailable = await this.headlessExecutor.isAvailable(); if (this.headlessAvailable) { this.log('info', 'Claude Code headless mode available - AI workers enabled'); // Forward headless executor events. #1855: also snapshot the // active child PIDs to disk on every transition so the next // lifetime can reap orphans after a hard crash. this.headlessExecutor.on('execution:start', (data) => { this.writeChildrenSnapshot(); this.emit('headless:start', data); }); this.headlessExecutor.on('execution:complete', (data) => { this.writeChildrenSnapshot(); this.emit('headless:complete', data); }); this.headlessExecutor.on('execution:error', (data) => { this.writeChildrenSnapshot(); this.emit('headless:error', data); }); this.headlessExecutor.on('output', (data) => { this.emit('headless:output', data); }); } else { this.log('info', 'Claude Code not found - AI workers will run in local fallback mode'); } } catch (error) { this.log('warn', `Failed to initialize headless executor: ${error}`); this.headlessAvailable = false; } } /** * Check if headless execution is available */ isHeadlessAvailable() { return this.headlessAvailable; } /** * Get headless executor instance */ getHeadlessExecutor() { return this.headlessExecutor; } /** * Detect effective CPU count for the current environment. 
* * Inside Docker / K8s containers, os.cpus().length reports the HOST cpu * count, not the container limit (Node.js #28762 — wontfix). We read * cgroup v2 / v1 quota files first so the maxCpuLoad threshold stays * meaningful under resource-limited containers. */ /** * #2110 — detect WSL2 / WSL1 so the CPU-load gate can use a sane * default. `/proc/loadavg` on WSL maps in Windows-side process counts * and routinely reports values 100-1000x larger than real Linux load. * * Detection order: * 1. `WSL_DISTRO_NAME` env var (set by Microsoft's WSL launcher) * 2. `WSL_INTEROP` env var (set by recent WSL2) * 3. `/proc/sys/kernel/osrelease` contains "microsoft" or "WSL" * (kernel build marker; survives env stripping) */ static isWslEnvironment() { if (process.env.WSL_DISTRO_NAME || process.env.WSL_INTEROP) return true; try { const osrelease = readFileSync('/proc/sys/kernel/osrelease', 'utf8').toLowerCase(); if (osrelease.includes('microsoft') || osrelease.includes('wsl')) return true; } catch { /* not on Linux or /proc inaccessible */ } return false; } static getEffectiveCpuCount() { // 1. Try cgroup v2: /sys/fs/cgroup/cpu.max try { const cpuMax = readFileSync('/sys/fs/cgroup/cpu.max', 'utf8').trim(); const [quotaStr, periodStr] = cpuMax.split(' '); if (quotaStr !== 'max') { const quota = parseInt(quotaStr, 10); const period = parseInt(periodStr, 10); if (quota > 0 && period > 0) return Math.ceil(quota / period); } } catch { /* not in cgroup v2 */ } // 2. Try cgroup v1: /sys/fs/cgroup/cpu/cpu.cfs_quota_us try { const quota = parseInt(readFileSync('/sys/fs/cgroup/cpu/cpu.cfs_quota_us', 'utf8').trim(), 10); const period = parseInt(readFileSync('/sys/fs/cgroup/cpu/cpu.cfs_period_us', 'utf8').trim(), 10); if (quota > 0 && period > 0) return Math.ceil(quota / period); } catch { /* not in cgroup v1 */ } // 3. Fallback to os.cpus().length return cpus().length || 1; } /** * Read daemon-specific config from .claude-flow/config.{json,yaml,yml}. 
* Supports dot-notation keys like 'daemon.resourceThresholds.maxCpuLoad'. * #1844: prefer JSON when both exist (existing behavior) but fall back * to YAML so operators using the v3 canonical YAML format aren't silently * ignored. The chosen path is logged at info level. */ readDaemonConfigFromFile(claudeFlowDir) { const jsonPath = join(claudeFlowDir, 'config.json'); const yamlPath = join(claudeFlowDir, 'config.yaml'); const ymlPath = join(claudeFlowDir, 'config.yml'); // eslint-disable-next-line @typescript-eslint/no-explicit-any let raw; let chosenPath; if (existsSync(jsonPath)) { try { raw = JSON.parse(readFileSync(jsonPath, 'utf-8')); chosenPath = jsonPath; } catch { return {}; } } else if (existsSync(yamlPath) || existsSync(ymlPath)) { const yPath = existsSync(yamlPath) ? yamlPath : ymlPath; try { // Lazy-load yaml so the daemon doesn't hard-require it; if the // dep isn't installed, fall back to the previous warn-only path. // eslint-disable-next-line @typescript-eslint/no-var-requires const yamlMod = require('yaml'); const parsed = yamlMod.parse(readFileSync(yPath, 'utf-8')); if (parsed && typeof parsed === 'object') { // eslint-disable-next-line @typescript-eslint/no-explicit-any raw = parsed; chosenPath = yPath; } } catch { this.log('warn', `Found ${yPath} but yaml parser unavailable. Install \`yaml\` or convert to JSON. Falling back to defaults.`); return {}; } } if (!raw || !chosenPath) { return {}; } this.log('info', `Daemon config loaded from ${chosenPath}`); try { // Support both flat keys at root and nested under scopes.project const cfg = raw?.scopes?.project ?? raw; const rawCpuLoad = cfg['daemon.resourceThresholds.maxCpuLoad'] ?? raw['daemon.resourceThresholds.maxCpuLoad']; const rawMinMem = cfg['daemon.resourceThresholds.minFreeMemoryPercent'] ?? raw['daemon.resourceThresholds.minFreeMemoryPercent']; const rawMaxConcurrent = cfg['daemon.maxConcurrent'] ?? raw['daemon.maxConcurrent']; const rawTimeout = cfg['daemon.workerTimeoutMs'] ?? 
raw['daemon.workerTimeoutMs']; return { autoStart: typeof raw['daemon.autoStart'] === 'boolean' ? raw['daemon.autoStart'] : undefined, maxConcurrent: (typeof rawMaxConcurrent === 'number' && rawMaxConcurrent > 0) ? rawMaxConcurrent : undefined, workerTimeoutMs: (typeof rawTimeout === 'number' && rawTimeout > 0) ? rawTimeout : undefined, maxCpuLoad: (typeof rawCpuLoad === 'number' && rawCpuLoad > 0 && rawCpuLoad < 1000) ? rawCpuLoad : undefined, minFreeMemoryPercent: (typeof rawMinMem === 'number' && rawMinMem >= 0 && rawMinMem <= 100) ? rawMinMem : undefined, }; } catch { return {}; } } /** * Setup graceful shutdown handlers */ setupShutdownHandlers() { const shutdown = async () => { this.log('info', 'Received shutdown signal, stopping daemon...'); await this.stop(); process.exit(0); }; process.on('SIGTERM', shutdown); process.on('SIGINT', shutdown); process.on('SIGHUP', shutdown); } /** * #1855: install crash handlers for uncaught exceptions and unhandled * rejections. Without these, a thrown error from any timer callback, * worker logic path, or transitive import crashes the daemon process * silently — the PID file leaks and any in-flight child processes * orphan. With these, we log a structured crash record, run stop() * to clean up, then exit 1 so the process actually dies (otherwise * Node would crash anyway after the handler returns). */ installCrashHandlers() { const onCrash = (kind, err) => { // Best-effort logging; never throw from inside the crash handler. try { this.writeCrashRecord(kind, err); } catch { /* nothing more we can do */ } try { // Synchronous stop — don't await; the process is dying. Just // remove the PID file and snapshot state so the next start // sees a clean slate. this.removePidFile(); this.saveState(); // Snapshot any in-flight child PIDs one last time so the next // lifetime can reap them. this.writeChildrenSnapshot(); } catch { /* ignore */ } // Exit non-zero so supervisors / shells see the failure. 
process.exit(1); }; process.on('uncaughtException', (err) => onCrash('uncaughtException', err)); process.on('unhandledRejection', (err) => onCrash('unhandledRejection', err)); } /** * Append a structured crash record to .claude-flow/logs/crash.log. * Inspectable by hand or via `ruflo daemon status` follow-ups. */ writeCrashRecord(kind, err) { const logDir = this.config.logDir; if (!existsSync(logDir)) mkdirSync(logDir, { recursive: true }); const crashLog = join(logDir, 'crash.log'); const ts = new Date().toISOString(); const message = err instanceof Error ? err.message : String(err); const stack = err instanceof Error && err.stack ? err.stack : '<no stack>'; const record = `[${ts}] [${kind}] pid=${process.pid} ${message}\n${stack}\n---\n`; appendFileSync(crashLog, record, 'utf-8'); this.log('warn', `Daemon crashed (${kind}): ${message} — see ${crashLog}`); } /** * Path to the on-disk children registry — list of headless worker * child PIDs the daemon currently owns. #1855: written on every * execution:start / :complete / :error transition; read by the next * lifetime to reap orphans after a hard crash. */ get childrenFile() { return join(this.projectRoot, '.claude-flow', 'daemon-children.json'); } /** * #1856: detect workers that were mid-flight when the previous daemon * lifetime ended. A mid-flight worker has `lastStartedAt > lastRun` * (started after the last successful completion). On crash recovery * we count these as failures so the run-counter math stays consistent * (`runCount === successCount + failureCount`). Workers naturally * retry at their next scheduled interval; we deliberately don't * immediately re-run because the failure may have been deterministic. */ detectMidFlightFailures() { let detected = 0; for (const [type, state] of this.workers.entries()) { const startedAt = state.lastStartedAt?.getTime() ?? 0; const lastRunAt = state.lastRun?.getTime() ?? 
0; // started after the last successful completion → was mid-flight if (startedAt > 0 && startedAt > lastRunAt) { state.failureCount++; state.isRunning = false; // Don't bump runCount — it was already incremented at start this.log('info', `Worker ${type} was mid-flight at last crash (started ${state.lastStartedAt?.toISOString()}); counted as failure, will retry at next scheduled interval`); detected++; } } if (detected > 0) { this.saveState(); } } /** * Snapshot the currently-active headless worker child PIDs to disk. * Best-effort; failures don't propagate. */ writeChildrenSnapshot() { if (!this.headlessExecutor) return; try { const pids = this.headlessExecutor.getActiveChildPids(); const dir = join(this.projectRoot, '.claude-flow'); if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); writeFileSync(this.childrenFile, JSON.stringify({ pids, daemonPid: process.pid, timestamp: new Date().toISOString() }, null, 2), 'utf-8'); } catch { /* best-effort */ } } /** * #1855: reap orphan headless worker children left behind by a * previous crashed lifetime. Reads `.claude-flow/daemon-children.json`, * SIGTERMs any PID still alive that doesn't belong to the current * daemon, then truncates the file. Called at the top of `start()` * so the next lifetime starts with a clean process tree. */ reapOrphanedChildren() { const file = this.childrenFile; if (!existsSync(file)) return; let snapshot; try { snapshot = JSON.parse(readFileSync(file, 'utf-8')); } catch { try { unlinkSync(file); } catch { /* ignore */ } return; } const pids = Array.isArray(snapshot.pids) ? snapshot.pids : []; let reaped = 0; for (const pid of pids) { if (typeof pid !== 'number' || pid <= 0) continue; if (pid === process.pid) continue; // never our own PID try { process.kill(pid, 0); // is alive? 
process.kill(pid, 'SIGTERM'); reaped++; } catch { // already dead — fine } } if (reaped > 0) { this.log('info', `Reaped ${reaped} orphan headless worker child(ren) from previous lifetime`); } try { unlinkSync(file); } catch { /* ignore */ } } /** * Check if system resources allow worker execution */ async canRunWorker() { const os = await import('os'); const cpuLoad = os.loadavg()[0]; const totalMem = os.totalmem(); const freeMem = os.freemem(); const freePercent = (freeMem / totalMem) * 100; if (cpuLoad > this.config.resourceThresholds.maxCpuLoad) { return { allowed: false, reason: `CPU load too high: ${cpuLoad.toFixed(2)}` }; } if (freePercent < this.config.resourceThresholds.minFreeMemoryPercent) { return { allowed: false, reason: `Memory too low: ${freePercent.toFixed(1)}% free` }; } return { allowed: true }; } /** * Process pending workers queue * * When executeWorkerWithConcurrencyControl defers a worker (returns null), * we break immediately to avoid a busy-wait loop — the deferred worker is * already back on the pendingWorkers queue by that point. If no workers are * currently running when we break, we schedule a backoff retry so the queue * does not get permanently stuck. */ async processPendingWorkers() { while (this.pendingWorkers.length > 0 && this.runningWorkers.size < this.config.maxConcurrent) { const workerType = this.pendingWorkers.shift(); const workerConfig = this.config.workers.find(w => w.type === workerType); if (workerConfig) { const result = await this.executeWorkerWithConcurrencyControl(workerConfig); if (result === null) { // Worker was deferred (resource pressure or concurrency limit). // Break to avoid tight-looping — the next executeWorker() completion // will call processPendingWorkers() again via the finally block. if (this.runningWorkers.size === 0) { // No workers running means nobody will trigger the finally-block // callback, so schedule a backoff retry to avoid a stuck queue. 
setTimeout(() => this.processPendingWorkers(), 30_000).unref(); } break; } } } } initializeWorkerStates() { // Try to restore state from file if (existsSync(this.config.stateFile)) { try { const saved = JSON.parse(readFileSync(this.config.stateFile, 'utf-8')); // CRITICAL: Restore worker config (including enabled flag) from saved state // This fixes #950: daemon enable command not persisting worker state if (saved.config?.workers && Array.isArray(saved.config.workers)) { for (const savedWorker of saved.config.workers) { const workerConfig = this.config.workers.find(w => w.type === savedWorker.type); if (workerConfig && typeof savedWorker.enabled === 'boolean') { workerConfig.enabled = savedWorker.enabled; } } } // Restore resourceThresholds, maxConcurrent, workerTimeoutMs from saved state // Only restore if valid numeric values within sane ranges if (saved.config?.resourceThresholds && !this.originalConfig?.resourceThresholds) { const rt = saved.config.resourceThresholds; if (typeof rt.maxCpuLoad === 'number' && rt.maxCpuLoad > 0 && rt.maxCpuLoad < 1000) { this.config.resourceThresholds.maxCpuLoad = rt.maxCpuLoad; } if (typeof rt.minFreeMemoryPercent === 'number' && rt.minFreeMemoryPercent >= 0 && rt.minFreeMemoryPercent <= 100) { this.config.resourceThresholds.minFreeMemoryPercent = rt.minFreeMemoryPercent; } } if (typeof saved.config?.maxConcurrent === 'number' && saved.config.maxConcurrent > 0) { this.config.maxConcurrent = saved.config.maxConcurrent; } if (typeof saved.config?.workerTimeoutMs === 'number' && saved.config.workerTimeoutMs > 0) { this.config.workerTimeoutMs = saved.config.workerTimeoutMs; } // Restore worker runtime states (runCount, successCount, etc.) 
if (saved.workers) { for (const [type, state] of Object.entries(saved.workers)) { const savedState = state; const lastRunValue = savedState.lastRun; const lastStartedAtValue = savedState.lastStartedAt; this.workers.set(type, { runCount: savedState.runCount || 0, successCount: savedState.successCount || 0, failureCount: savedState.failureCount || 0, averageDurationMs: savedState.averageDurationMs || 0, lastRun: lastRunValue ? new Date(lastRunValue) : undefined, lastStartedAt: lastStartedAtValue ? new Date(lastStartedAtValue) : undefined, nextRun: undefined, isRunning: false, }); } } } catch { // Ignore parse errors, start fresh } } // Initialize any missing workers for (const workerConfig of this.config.workers) { if (!this.workers.has(workerConfig.type)) { this.workers.set(workerConfig.type, { runCount: 0, successCount: 0, failureCount: 0, averageDurationMs: 0, isRunning: false, }); } } } /** * Get the PID file path for singleton enforcement (#1395 Bug 3). */ get pidFile() { return join(this.projectRoot, '.claude-flow', 'daemon.pid'); } /** * Check if another daemon instance is already running. * Returns the existing PID if alive, or null if no daemon is running. * * #1853: ignore self-PID matches. The detached-spawn path in * `commands/daemon.ts` writes the child's PID into the file as a * fallback after a 500ms wait. If the child reaches `start()` slower * than the parent's 500ms wait (observed on Node 25 / macOS 26), the * child reads its own PID back from the file and concludes "another * daemon is already running" — so it exits before scheduling workers * and `daemon status` reports STOPPED forever. A daemon process is * never "another instance" of itself; treat self-match as absence. */ checkExistingDaemon() { if (!existsSync(this.pidFile)) return null; try { const pid = parseInt(readFileSync(this.pidFile, 'utf-8').trim(), 10); if (isNaN(pid)) return null; // #1853: a PID file containing our own PID is not "another daemon". 
// Treat as absent so the start() path proceeds normally. if (pid === process.pid) return null; // Check if process is alive (signal 0 = existence check) process.kill(pid, 0); return pid; // Process is alive } catch { // Process is dead — clean up stale PID file try { unlinkSync(this.pidFile); } catch { /* ignore */ } return null; } } /** * Write PID file for singleton enforcement. */ writePidFile() { const dir = join(this.projectRoot, '.claude-flow'); if (!existsSync(dir)) mkdirSync(dir, { recursive: true }); writeFileSync(this.pidFile, String(process.pid), 'utf-8'); } /** * Remove PID file on shutdown. */ removePidFile() { try { unlinkSync(this.pidFile); } catch { /* ignore */ } } /** * Start the daemon and all enabled workers */ async start() { if (this.running) { this.emit('warning', 'Daemon already running'); return; } // PID singleton enforcement (#1395 Bug 3): prevent daemon accumulation const existingPid = this.checkExistingDaemon(); if (existingPid !== null) { this.log('info', `Daemon already running (PID: ${existingPid}), skipping start`); this.emit('warning', `Daemon already running (PID: ${existingPid})`); return; } // #1855: reap orphan headless worker children left by a previous // crashed lifetime, BEFORE we mark ourselves running and start // accepting new work. The children file from the prior daemon's // last-snapshot is the authoritative list. this.reapOrphanedChildren(); // #1856: detect workers that were mid-flight at the previous crash // and count them as failures so runCount/successCount/failureCount // stay consistent. Workers retry naturally at their next scheduled // interval — we don't immediately re-run them, which avoids a // freshly-recovered daemon hammering the same code path that just // killed it. 
this.detectMidFlightFailures(); this.running = true; this.startedAt = new Date(); this.writePidFile(); this.emit('started', { pid: process.pid, startedAt: this.startedAt }); // Schedule all enabled workers for (const workerConfig of this.config.workers) { if (workerConfig.enabled) { this.scheduleWorker(workerConfig); } } // #1845: poll the MCP-dispatch queue directory so workers requested // via mcp__hooks_worker-dispatch (in a separate process) actually // execute here. Previously the dispatch wrote to a process-local Map // that the daemon could never see. this.queuePollTimer = setInterval(() => { void this.processDispatchQueue(); }, 5_000); if (typeof this.queuePollTimer.unref === 'function') { this.queuePollTimer.unref(); } // Save state this.saveState(); this.log('info', `Daemon started (PID: ${process.pid}, CPUs: ${cpus().length}, workers: ${this.config.workers.filter(w => w.enabled).length}, maxCpuLoad: ${this.config.resourceThresholds.maxCpuLoad}, minFreeMemoryPercent: ${this.config.resourceThresholds.minFreeMemoryPercent}%)`); } /** * #1845: ingest queue entries written by mcp__hooks_worker-dispatch. * Each entry is a JSON file at `.claude-flow/daemon-queue/<id>.json` * with `{ workerId, trigger, context, enqueuedAt }`. We move processed * files to `.claude-flow/daemon-queue/.processed/` so the daemon never * re-runs the same dispatch and operators can inspect history. 
*/ async processDispatchQueue() { if (!this.running) return; const queueDir = join(this.projectRoot, '.claude-flow', 'daemon-queue'); if (!existsSync(queueDir)) return; let entries; try { const fs = await import('fs'); entries = fs.readdirSync(queueDir).filter((n) => n.endsWith('.json')); } catch { return; } if (entries.length === 0) return; const fs = await import('fs'); const processedDir = join(queueDir, '.processed'); if (!existsSync(processedDir)) { try { fs.mkdirSync(processedDir, { recursive: true }); } catch { /* race ok */ } } for (const entry of entries) { const src = join(queueDir, entry); // eslint-disable-next-line @typescript-eslint/no-explicit-any let payload; try { payload = JSON.parse(fs.readFileSync(src, 'utf-8')); } catch { // Malformed entry — quarantine so we don't loop on it try { fs.renameSync(src, join(processedDir, `bad-${entry}`)); } catch { /* nothing more we can do */ } continue; } const trigger = payload?.trigger; const workerId = payload?.workerId; if (!trigger || !this.config.workers.some((w) => w.type === trigger)) { try { fs.renameSync(src, join(processedDir, `unknown-${entry}`)); } catch { /* ok */ } continue; } try { this.log('info', `Dequeued ${trigger}${workerId ? ` (id=${workerId})` : ''} from MCP dispatch queue`); await this.triggerWorker(trigger); } catch (err) { this.log('warn', `Queued worker ${trigger} failed: ${err.message}`); } finally { try { fs.renameSync(src, join(processedDir, entry)); } catch { /* ignore */ } } } } /** * Stop the daemon and all workers */ async stop() { if (!this.running) { this.emit('warning', 'Daemon not running'); return; } // Clear all timers (convert to array to avoid iterator issues) const timerEntries = Array.from(this.timers.entries()); for (const [type, timer] of timerEntries) { clearTimeout(timer); this.log('info', `Stopped worker: ${type}`); } this.timers.clear(); // #1845: stop the MCP-dispatch queue poller too. 
if (this.queuePollTimer) { clearInterval(this.queuePollTimer); this.queuePollTimer = undefined; } this.running = false; this.removePidFile(); this.saveState(); this.emit('stopped', { stoppedAt: new Date() }); this.log('info', 'Daemon stopped'); } /** * Get daemon status */ getStatus() { return { running: this.running, pid: process.pid, startedAt: this.startedAt, workers: new Map(this.workers), config: this.config, }; } /** * Schedule a worker to run at intervals with staggered start */ scheduleWorker(workerConfig) { const state = this.workers.get(workerConfig.type); const internalConfig = workerConfig; const staggerOffset = internalConfig.offsetMs || 0; // Calculate initial delay with stagger offset let initialDelay = staggerOffset; if (state.lastRun) { const timeSinceLastRun = Date.now() - state.lastRun.getTime(); initialDelay = Math.max(staggerOffset, workerConfig.intervalMs - timeSinceLastRun); } state.nextRun = new Date(Date.now() + initialDelay); const runAndReschedule = async () => { if (!this.running) return; // Use concurrency-controlled execution (P0 fix) await this.executeWorkerWithConcurrencyControl(workerConfig); // Reschedule if (this.running) { const timer = setTimeout(runAndReschedule, workerConfig.intervalMs); this.timers.set(workerConfig.type, timer); state.nextRun = new Date(Date.now() + workerConfig.intervalMs); } }; // Schedule first run with stagger offset const timer = setTimeout(runAndReschedule, initialDelay); this.timers.set(workerConfig.type, timer); this.log('info', `Scheduled ${workerConfig.type} (interval: ${workerConfig.intervalMs / 1000}s, first run in ${initialDelay / 1000}s)`); } /** * Execute a worker with concurrency control (P0 fix) */ async executeWorkerWithConcurrencyControl(workerConfig) { // Check concurrency limit if (this.runningWorkers.size >= this.config.maxConcurrent) { this.log('info', `Worker ${workerConfig.type} deferred: max concurrent (${this.config.maxConcurrent}) reached`); 
this.pendingWorkers.push(workerConfig.type); this.emit('worker:deferred', { type: workerConfig.type, reason: 'max_concurrent' }); return null; } // Check resource availability const resourceCheck = await this.canRunWorker(); if (!resourceCheck.allowed) { this.log('info', `Worker ${workerConfig.type} deferred: ${resourceCheck.reason}`); this.pendingWorkers.push(workerConfig.type); this.emit('worker:deferred', { type: workerConfig.type, reason: resourceCheck.reason }); return null; } return this.executeWorker(workerConfig); } /** * Execute a worker with timeout protection */ async executeWorker(workerConfig) { const state = this.workers.get(workerConfig.type); const workerId = `${workerConfig.type}_${Date.now()}`; const startTime = Date.now(); // Track running worker this.runningWorkers.add(workerConfig.type); state.isRunning = true; state.lastStartedAt = new Date(); // #1856: timestamp the start this.saveState(); // persist before we run anything this.emit('worker:start', { workerId, type: workerConfig.type }); this.log('info', `Starting worker: ${workerConfig.type} (${this.runningWorkers.size}/${this.config.maxConcurrent} concurrent)`); try { // Execute worker logic with timeout (P1 fix) // Pass cleanup callback to kill orphan child processes on timeout (#1117) const output = await this.runWithTimeout(() => this.runWorkerLogic(workerConfig), this.config.workerTimeoutMs, `Worker ${workerConfig.type} timed out after ${this.config.workerTimeoutMs / 1000}s`, () => { // On timeout, cancel any headless execution to prevent orphan processes if (this.headlessExecutor) { this.headlessExecutor.cancelAll(); } }); const durationMs = Date.now() - startTime; // Update state state.runCount++; state.successCount++; state.lastRun = new Date(); state.averageDurationMs = (state.averageDurationMs * (state.runCount - 1) + durationMs) / state.runCount; state.isRunning = false; const result = { workerId, type: workerConfig.type, success: true, durationMs, output, timestamp: new Date(), 
}; this.emit('worker:complete', result); this.log('info', `Worker ${workerConfig.type} completed in ${durationMs}ms`); this.saveState(); return result; } catch (error) { const durationMs = Date.now() - startTime; state.runCount++; state.failureCount++; state.lastRun = new Date(); state.isRunning = false; const result = { workerId, type: workerConfig.type, success: false, durationMs, error: error instanceof Error ? error.message : String(error), timestamp: new Date(), }; this.emit('worker:error', result); this.log('error', `Worker ${workerConfig.type} failed: ${result.error}`); this.saveState(); return result; } finally { // Remove from running set and process queue this.runningWorkers.delete(workerConfig.type); this.processPendingWorkers(); } } /** * Run a function with timeout (P1 fix) * @param fn - The async function to execute * @param timeoutMs - Timeout in milliseconds * @param timeoutMessage - Error message on timeout * @param onTimeout - Optional cleanup callback invoked when timeout fires (#1117: kills orphan processes) */ async runWithTimeout(fn, timeoutMs, timeoutMessage, onTimeout) { return new Promise((resolve, reject) => { let settled = false; const timer = setTimeout(() => { if (settled) return; settled = true; // Kill orphan child processes before rejecting (#1117) if (onTimeout) { try { onTimeout(); } catch { // Ignore cleanup errors } } reject(new Error(timeoutMessage)); }, timeoutMs); fn() .then((result) => { if (settled) return; settled = true; clearTimeout(timer); resolve(result); }) .catch((error) => { if (settled) return; settled = true; clearTimeout(timer); reject(error); }); }); } /** * Run the actual worker logic */ async runWorkerLogic(workerConfig) { // Check if this is a headless worker type and headless execution is available if (isHeadlessWorker(workerConfig.type) && this.headlessAvailable && this.headlessExecutor) { try { this.log('info', `Running ${workerConfig.type} in headless mode (Claude Code AI)`); const result = await 
this.headlessExecutor.execute(workerConfig.type); // #2110 — `HeadlessWorkerExecutor.execute()` returns // `createErrorResult(...)` with `success: false` when // `isAvailable()` is false, instead of throwing. The previous // try/catch never fired in that path, and the result was // persisted as mode:"headless" despite being a stub. Downstream // dashboards / `memory stats` couldn't distinguish a real AI // run from a fallback. Treat falsy success the same as throw. const ok = result?.success === true; if (!ok) { const reason = result?.error || result?.note || 'headless executor reported success=false'; this.log('warn', `Headless ${workerConfig.type} returned success=false (${String(reason).slice(0, 200)}); falling back to local mode`); this.emit('headless:fallback', { type: workerConfig.type, error: String(reason).slice(0, 500), }); // Fall through to local switch. } else { // #1793: persist the headless result to the same metrics files the // local workers write to. Without this, AI-mode runs produced rich // parsedOutput that lived only in `.claude-flow/logs/headless/*` and // never reached `.claude-flow/metrics/<name>.json` — `memory stats` // and downstream consumers saw nothing despite successful runs. try { this.persistHeadlessResult(workerConfig.type, result); } catch (persistError) { this.log('warn', `Failed to persist headless result for ${workerConfig.type}: ${persistError.message}`); } return { mode: 'headless', ...result, }; } } catch (error) { this.log('warn', `Headless execution failed for ${workerConfig.type}, falling back to local mode`); this.emit('headless:fallback', { type: workerConfig.type, error: error instanceof Error ? 
error.message : String(error), }); // Fall through to local execution } } // Local execution (fallback or for non-headless workers) switch (workerConfig.type) { case 'map': return this.runMapWorker(); case 'audit': return this.runAuditWorkerLocal(); case 'optimize': return this.runOptimizeWorkerLocal(); case 'consolidate': return this.runConsolidateWorker(); case 'testgaps': return this.runTestGapsWorkerLocal(); case 'predict': return this.runPredictWorkerLocal(); case 'document': return this.runDocumentWorkerLocal(); case 'ultralearn': return this.runUltralearnWorkerLocal(); case 'refactor': return this.runRefactorWorkerLocal(); case 'deepdive': return this.runDeepdiveWorkerLocal(); case 'benchmark': return this.runBenchmarkWorkerLocal(); case 'preload': return this.runPreloadWorkerLocal(); default: return { status: 'unknown worker type', mode: 'local' }; } } /** * #1793: persist a headless worker result to the same metrics file the * local fallback writes to. Without this, AI-mode workers produced rich * structured output (audit findings, perf signals, test-gap analysis) * that lived only in `.claude-flow/logs/headless/*_result.log` and was * invisible to `npx ruflo memory stats` or the metrics consumers. * * The mapping mirrors the `*Local` worker implementations below so a * single consumer path works regardless of execution mode. */ persistHeadlessResult(workerType, result) { const metricsDir = join(this.projectRoot, '.claude-flow', 'metrics'); if (!existsSync(metricsDir)) mkdirSync(metricsDir, { recursive: true }); // Filename mirrors the local-mode worker writes (security-audit.json, // performance.json, test-gaps.json) so a downstream reader doesn't // care which mode produced the data. 
const filenameMap = { audit: 'security-audit.json', optimize: 'performance.json', testgaps: 'test-gaps.json', document: 'documentation.json', refactor: 'refactor.json', deepdive: 'deepdive.json', ultralearn: 'ultralearn.json', predict: 'predictions.json', }; const filename = filenameMap[workerType] ?? `${workerType}.json`; const metricsFile = join(metricsDir, filename); const persisted = { timestamp: result.timestamp instanceof Date ? result.timestamp.toISOString() : new Date().toISOString(), mode: 'headless', workerType, model: result.model, durationMs: result.durationMs, tokensUsed: result.tokensUsed, executionId: result.executionId, success: result.success, // Structured findings live here when the worker emits JSON (e.g. the // audit worker's vulnerability list). Fall back to a raw-output // pointer so consumers can still locate the full log. findings: result.parsedOutput ?? null, rawOutputPreview: typeof result.output === 'string' ? result.output.slice(0, 2000) : undefined, rawOutputLength: typeof result.output === 'string' ? result.output.length : 0, }; writeFileSync(metricsFile, JSON.stringify(persisted, null, 2)); } // Worker implementations async runMapWorker() { // Scan project structure and update metrics const metricsFile = join(this.projectRoot, '.claude-flow', 'metrics', 'codebase-map.json'); const metricsDir = join(this.projectRoot, '.claude-flow', 'metrics'); if (!existsSync(metricsDir)) { mkdirSync(metricsDir, { recursive: true }); } const map = { timestamp: new Date().toISOString(), projectRoot: this.projectRoot, structure: { hasPackageJson: existsSync(join(this.projectRoot, 'package.json')), hasTsConfig: existsSync(join(this.projectRoot, 'tsconfig.json')), hasClaudeConfig: existsSync(join(this.projectRoot, '.claude')), hasClaudeFlow: existsSync(join(this.projec