// @debugg-ai/debugg-ai-mcp
// Version:
// Zero-Config, Fully AI-Managed End-to-End Testing for all code gen platforms.
// 525 lines (524 loc) • 25.2 kB
// JavaScript
/**
* Tunnel Management Service
*
* Manages per-port ngrok tunnels with two layers of reuse:
*
* 1. Within-process — activeTunnels map, 55-min auto-shutoff timer.
* 2. Cross-process — file-backed RegistryStore so a second MCP instance
* on the same machine borrows an existing tunnel instead
* of provisioning a new one for the same port.
*
* Lifecycle:
* - Owned tunnels (isOwned=true) : this process created them; it disconnects
* and revokes the key on stop.
* - Borrowed tunnels (isOwned=false): another process owns them; on stop we
* only remove the local reference.
* - Auto-shutoff timer checks the shared registry before firing: if another
* process recently touched the entry the timer resets instead of stopping.
*/
import { Logger } from '../../utils/logger.js';
import { Telemetry, TelemetryEvents } from '../../utils/telemetry.js';
import { isLocalhostUrl, extractLocalhostPort, generateTunnelUrl } from '../../utils/urlParser.js';
import { v4 as uuidv4 } from 'uuid';
import { FaultInjector, TunnelTrace, getFaultModeFromEnv } from './tunnelFaultInjection.js';
import { getDefaultRegistry, } from './tunnelRegistry.js';
// Cached result of `import('ngrok')`; null until first use or after resetNgrokModule().
let ngrokModule = null;
/**
 * Lazily load the `ngrok` package and cache the imported module.
 *
 * The import is deferred to first use so that merely importing this file does
 * not require `ngrok` to be resolvable.
 *
 * @returns {Promise<object>} The imported ngrok module.
 * @throws {Error} "Failed to load ngrok module: …" when the dynamic import
 *   rejects; the original error is attached as `cause` so its stack and
 *   error code are not lost.
 */
async function getNgrok() {
  if (!ngrokModule) {
    try {
      ngrokModule = await import('ngrok');
    }
    catch (error) {
      throw new Error(`Failed to load ngrok module: ${error}`, { cause: error });
    }
  }
  return ngrokModule;
}
/**
 * Drop the cached ngrok module reference so the next getNgrok() call re-runs
 * `import('ngrok')`.
 *
 * Called from two places in this file, each time paired with
 * `initialized = false`: after the last owned tunnel is disconnected (the
 * agent process may have exited), and after the first failed connect attempt.
 *
 * NOTE(review): dynamic import() results are cached by the module loader per
 * specifier, so re-importing presumably yields the same module instance —
 * confirm this is enough to get a fresh agent.
 */
function resetNgrokModule() {
ngrokModule = null;
}
// Module-scoped logger shared by everything in this file; constructed with
// `module: 'tunnelManager'` (presumably used by Logger to tag output — confirm).
const logger = new Logger({ module: 'tunnelManager' });
// ── TunnelManager ─────────────────────────────────────────────────────────────
/**
 * Manages one ngrok tunnel per local port, reusing tunnels within this process
 * (activeTunnels) and across processes (shared registry keyed by port).
 *
 * Tunnel info objects carry: tunnelId, originalUrl, tunnelUrl, publicUrl, port,
 * createdAt, lastAccessedAt, isOwned, and — for owned tunnels — keyId,
 * revokeKey, plus a transient autoShutoffTimer.
 *
 * Registry entries are keyed by port, so an entry found under a port is only
 * treated as "ours" when its tunnelId matches the tunnel in question. Another
 * process may have replaced the entry with its own tunnel for the same port.
 */
class TunnelManager {
  /** Cross-process registry store (uses read / write / prune / isPidAlive). */
  reg;
  /** tunnelId → tunnel info for every tunnel this process owns or has borrowed. */
  activeTunnels = new Map();
  /** port → in-flight createTunnel() promise; de-duplicates concurrent requests. */
  pendingTunnels = new Map();
  /** True once ensureInitialized() has run since the last agent reset. */
  initialized = false;
  /** Inactivity window (ms) after which a tunnel auto-shuts off. */
  TUNNEL_TIMEOUT_MS = 55 * 60 * 1000;
  /**
   * Bead `3th`: registry-entry freshness window. An entry not touched within
   * this many ms is treated as stale even if its owner PID is alive — defends
   * against PID-reuse (OS reassigns dead-owner's PID to a different process).
   */
  REGISTRY_FRESHNESS_TTL_MS = 30 * 60 * 1000;
  /**
   * Bead `mdp`: prune-on-startup eviction window, passed to the registry's
   * prune() as `staleAfterMs` when TunnelManager is constructed.
   */
  REGISTRY_PRUNE_THRESHOLD_MS = 60 * 60 * 1000;
  /**
   * Backoff schedule (ms) between ngrok.connect() retry attempts. Bead ixh.
   * Exposed on the class so tests can override with short delays without
   * changing the public API or depending on jest fake timers.
   */
  connectBackoffMs = [500, 1500];

  /**
   * @param {object} [reg] Registry store; defaults to getDefaultRegistry().
   */
  constructor(reg = getDefaultRegistry()) {
    this.reg = reg;
    // Bead `mdp`: sweep stale entries on startup so the registry doesn't grow
    // unboundedly across MCP processes that exited without stopAllTunnels
    // (SIGKILL / crash). Best-effort — a failing prune never blocks construction.
    try {
      const result = this.reg.prune({ staleAfterMs: this.REGISTRY_PRUNE_THRESHOLD_MS });
      if (result.pruned > 0) {
        logger.info(`Pruned ${result.pruned} stale registry entries on startup (${result.remaining} remaining)`);
      }
    }
    catch (err) {
      logger.warn(`Registry prune-on-startup failed (non-fatal): ${err}`);
    }
  }

  /**
   * Bead `3th`: freshness check used at borrow sites.
   *
   * @param {object} entry Registry entry (reads ownerPid, lastAccessedAt).
   * @param {number} [nowMs] Reference time in ms; defaults to Date.now().
   * @returns {boolean} True if the entry's owner PID is alive AND the entry was
   *   touched within REGISTRY_FRESHNESS_TTL_MS.
   */
  isEntryUsable(entry, nowMs = Date.now()) {
    return (this.reg.isPidAlive(entry.ownerPid) &&
      (nowMs - entry.lastAccessedAt) <= this.REGISTRY_FRESHNESS_TTL_MS);
  }

  /**
   * Whether a registry entry (looked up by port) describes the given tunnel.
   *
   * @param {object|undefined} entry Registry entry, possibly missing.
   * @param {object} tunnelInfo Local tunnel info.
   * @returns {boolean} True only when the entry exists and carries the same tunnelId.
   */
  isRegistryEntryFor(entry, tunnelInfo) {
    return Boolean(entry) && entry.tunnelId === tunnelInfo.tunnelId;
  }

  /**
   * Cancel a tunnel's pending auto-shutoff timer, if any.
   *
   * @param {object} tunnelInfo Local tunnel info.
   */
  clearTunnelTimer(tunnelInfo) {
    if (tunnelInfo.autoShutoffTimer) {
      clearTimeout(tunnelInfo.autoShutoffTimer);
      tunnelInfo.autoShutoffTimer = undefined;
    }
  }

  /**
   * Invoke a key-revocation callback without letting it break the caller.
   * Handles async rejection, a synchronous throw, and a non-promise return.
   *
   * @param {Function} revokeKey Callback that revokes the backend key.
   * @param {string} failureMessage Log prefix used when revocation fails.
   */
  revokeKeyQuietly(revokeKey, failureMessage) {
    try {
      Promise.resolve(revokeKey()).catch((err) => logger.warn(failureMessage, err));
    }
    catch (err) {
      logger.warn(failureMessage, err);
    }
  }

  // ── Public API ──────────────────────────────────────────────────────────────

  /**
   * Resolve a URL to something reachable from outside this machine.
   *
   * @param {string} url URL to process.
   * @param {string} authToken ngrok auth token; required for localhost URLs.
   * @param {string} [specificTunnelId] Tunnel id to use instead of a fresh UUID.
   * @param {string} [keyId] Stored on the tunnel info when a tunnel is created.
   * @param {Function} [revokeKey] Callback that revokes the key behind authToken.
   * @returns {Promise<object>} `{ url, isLocalhost: false }` for non-localhost
   *   URLs, otherwise `{ url: publicUrl, tunnelId, isLocalhost: true }`.
   * @throws {Error} If the port cannot be extracted, the auth token is missing,
   *   or tunnel creation fails.
   */
  async processUrl(url, authToken, specificTunnelId, keyId, revokeKey) {
    if (!isLocalhostUrl(url)) {
      return { url, isLocalhost: false };
    }
    const port = extractLocalhostPort(url);
    if (!port) {
      throw new Error(`Could not extract port from localhost URL: ${url}`);
    }
    if (!authToken) {
      throw new Error('Auth token required to create tunnel for localhost URL');
    }
    const tunnelId = specificTunnelId || uuidv4();
    return this.processPerPort(url, port, authToken, tunnelId, keyId, revokeKey);
  }

  /**
   * Return an active tunnel for the given local port, or undefined.
   *
   * Owned tunnels are returned as-is. A borrowed tunnel is returned only while
   * the registry entry for the port still describes that same tunnel and is
   * usable (see isEntryUsable). Otherwise the local reference is evicted and
   * its auto-shutoff timer cancelled, so it cannot fire later for a tunnel
   * that is no longer tracked.
   *
   * @param {number} port Local port.
   * @returns {object|undefined} Tunnel info, or undefined if none / evicted.
   */
  getTunnelForPort(port) {
    const existing = this.findTunnelByPort(port);
    if (!existing) {
      return undefined;
    }
    if (existing.isOwned) {
      return existing;
    }
    const entry = this.reg.read()[String(port)];
    if (this.isRegistryEntryFor(entry, existing) && this.isEntryUsable(entry)) {
      return existing;
    }
    this.clearTunnelTimer(existing);
    this.activeTunnels.delete(existing.tunnelId);
    let reason;
    if (!entry) {
      reason = 'no registry entry';
    }
    else if (entry.tunnelId !== existing.tunnelId) {
      reason = `registry entry now belongs to tunnel ${entry.tunnelId}`;
    }
    else if (!this.reg.isPidAlive(entry.ownerPid)) {
      reason = `owner PID ${entry.ownerPid} dead`;
    }
    else {
      reason = `entry stale (last accessed ${Math.round((Date.now() - entry.lastAccessedAt) / 1000)}s ago)`;
    }
    logger.info(`Evicted stale borrowed tunnel ${existing.tunnelId} (${reason})`);
    return undefined;
  }

  /**
   * Mark a tunnel as recently used: refresh its registry entry (best-effort)
   * and restart its local auto-shutoff timer. No-op for unknown ids.
   *
   * @param {string} tunnelId Tunnel id.
   */
  touchTunnel(tunnelId) {
    const tunnelInfo = this.activeTunnels.get(tunnelId);
    if (!tunnelInfo) {
      return;
    }
    // Refresh the shared registry entry so the owning process won't auto-shutoff
    // while we're actively using the tunnel (even if we're borrowing it). Only
    // touch the entry if it is still this tunnel's entry.
    try {
      const registry = this.reg.read();
      const entry = registry[String(tunnelInfo.port)];
      if (this.isRegistryEntryFor(entry, tunnelInfo)) {
        entry.lastAccessedAt = Date.now();
        this.reg.write(registry);
      }
    }
    catch (err) {
      logger.warn(`Registry touch failed for tunnel ${tunnelId} (non-fatal): ${err}`);
    }
    this.resetTunnelTimer(tunnelInfo);
  }

  /**
   * touchTunnel() addressed by public tunnel URL; ignores URLs that carry no tunnel id.
   *
   * @param {string} url Public tunnel URL.
   */
  touchTunnelByUrl(url) {
    const tunnelId = this.extractTunnelId(url);
    if (tunnelId) {
      this.touchTunnel(tunnelId);
    }
  }

  /**
   * @param {string} url Any URL string.
   * @returns {boolean} True if the string contains `.ngrok.debugg.ai`.
   */
  isTunnelUrl(url) {
    return url.includes('.ngrok.debugg.ai');
  }

  /**
   * @param {string} url Any URL string.
   * @returns {string|null} The first host label of an http(s)
   *   `<id>.ngrok.debugg.ai` URL, or null when the URL does not match.
   */
  extractTunnelId(url) {
    const match = url.match(/https?:\/\/([^.]+)\.ngrok\.debugg\.ai/);
    return match ? match[1] : null;
  }

  /**
   * @param {string} tunnelId Tunnel id.
   * @returns {object|undefined} Tunnel info, if tracked by this process.
   */
  getTunnelInfo(tunnelId) {
    return this.activeTunnels.get(tunnelId);
  }

  /**
   * @returns {object[]} Snapshot array of all locally tracked tunnel infos.
   */
  getActiveTunnels() {
    return Array.from(this.activeTunnels.values());
  }

  /**
   * Stop tracking a tunnel.
   *
   * Borrowed tunnels: only the local reference is dropped. Owned tunnels: the
   * registry entry is removed (only if it is still this tunnel's entry), the
   * ngrok tunnel is disconnected, the ngrok module cache is reset when no owned
   * tunnels remain, and the key is revoked (fire-and-forget).
   *
   * @param {string} tunnelId Tunnel id.
   * @returns {Promise<void>}
   */
  async stopTunnel(tunnelId) {
    const tunnelInfo = this.activeTunnels.get(tunnelId);
    if (!tunnelInfo) {
      logger.warn(`Tunnel ${tunnelId} not found for cleanup`);
      return;
    }
    this.clearTunnelTimer(tunnelInfo);
    this.activeTunnels.delete(tunnelId);
    if (!tunnelInfo.isOwned) {
      // Borrowed — just drop the local reference; owner manages the real tunnel
      logger.info(`Released borrowed tunnel reference: ${tunnelInfo.publicUrl}`);
      Telemetry.capture(TelemetryEvents.TUNNEL_STOPPED, { port: tunnelInfo.port, reason: 'released', isOwned: false });
      return;
    }
    // Owned — remove our entry from the shared registry. The entry is keyed by
    // port, so verify it is still ours before deleting; another process may
    // have registered its own tunnel for this port in the meantime.
    try {
      const registry = this.reg.read();
      const key = String(tunnelInfo.port);
      if (this.isRegistryEntryFor(registry[key], tunnelInfo)) {
        delete registry[key];
        this.reg.write(registry);
      }
    }
    catch (err) {
      logger.warn(`Registry cleanup failed for tunnel ${tunnelId} (non-fatal): ${err}`);
    }
    try {
      const ngrok = await getNgrok();
      await ngrok.disconnect(tunnelInfo.tunnelUrl);
      logger.info(`Cleaned up tunnel: ${tunnelInfo.publicUrl}`);
    }
    catch (error) {
      logger.warn(`ngrok.disconnect failed for tunnel ${tunnelId} (already cleaned up):`, error);
    }
    // If no owned tunnels remain, the ngrok agent process may have exited.
    // Reset module + init state so the next connect() bootstraps a fresh agent.
    const hasOwnedTunnels = Array.from(this.activeTunnels.values()).some((t) => t.isOwned);
    if (!hasOwnedTunnels) {
      logger.info('No owned tunnels remain — resetting ngrok module for fresh init on next request');
      resetNgrokModule();
      this.initialized = false;
    }
    if (tunnelInfo.revokeKey) {
      this.revokeKeyQuietly(tunnelInfo.revokeKey, `Failed to revoke key for tunnel ${tunnelId}:`);
    }
  }

  /**
   * Stop every tracked tunnel in parallel; individual failures are logged, not thrown.
   *
   * @returns {Promise<void>}
   */
  async stopAllTunnels() {
    const ids = Array.from(this.activeTunnels.keys());
    await Promise.all(ids.map((id) => this.stopTunnel(id).catch((err) => logger.error(`Failed to stop tunnel ${id}:`, err))));
    logger.info(`Stopped ${ids.length} tunnel(s)`);
  }

  /**
   * @param {string} tunnelId Tunnel id.
   * @returns {object|null} `{ tunnel, age, timeSinceLastAccess,
   *   timeUntilAutoShutoff }` (all durations in ms, the last clamped at 0),
   *   or null for unknown ids.
   */
  getTunnelStatus(tunnelId) {
    const tunnel = this.activeTunnels.get(tunnelId);
    if (!tunnel) {
      return null;
    }
    const now = Date.now();
    return {
      tunnel,
      age: now - tunnel.createdAt,
      timeSinceLastAccess: now - tunnel.lastAccessedAt,
      timeUntilAutoShutoff: Math.max(0, tunnel.lastAccessedAt + this.TUNNEL_TIMEOUT_MS - now),
    };
  }

  /**
   * @returns {object[]} getTunnelStatus() for every tracked tunnel.
   */
  getAllTunnelStatuses() {
    const statuses = [];
    for (const tunnelId of this.activeTunnels.keys()) {
      const status = this.getTunnelStatus(tunnelId);
      if (status) {
        statuses.push(status);
      }
    }
    return statuses;
  }

  // ── Per-port tunnel ─────────────────────────────────────────────────────────

  /**
   * Obtain a tunnel for a port, in order of preference: reuse a locally tracked
   * tunnel, join an in-flight creation, borrow another process's tunnel from
   * the registry, or create a new owned tunnel.
   *
   * @param {string} url Original localhost URL.
   * @param {number} port Local port.
   * @param {string} authToken ngrok auth token.
   * @param {string} tunnelId Id to use if a new tunnel is created.
   * @param {string} [keyId] Stored on a newly created tunnel.
   * @param {Function} [revokeKey] Callback revoking the key behind authToken.
   * @returns {Promise<object>} `{ url: publicUrl, tunnelId, isLocalhost: true }`.
   * @throws {Error} If tunnel creation fails (or the joined in-flight creation fails).
   */
  async processPerPort(url, port, authToken, tunnelId, keyId, revokeKey) {
    // 1. Check local in-process map (handles owned + borrowed with liveness check)
    const existing = this.getTunnelForPort(port);
    if (existing) {
      logger.info(`Reusing existing tunnel for port ${port}: ${existing.publicUrl}`);
      Telemetry.capture(TelemetryEvents.TUNNEL_PROVISIONED, { port, how: 'reused' });
      return { url: existing.publicUrl, tunnelId: existing.tunnelId, isLocalhost: true };
    }
    // 2. Deduplicate concurrent creation requests for the same port
    const pending = this.pendingTunnels.get(port);
    if (pending) {
      // Bead 7qh Finding 2: our minted tunnelKey/keyId are now redundant — the
      // in-flight call owns the tunnel for this port. Revoke our key up-front
      // so it doesn't orphan on the backend. Failures are swallowed: we can't
      // let cleanup break the join.
      if (revokeKey) {
        this.revokeKeyQuietly(revokeKey, `Failed to revoke redundant key while joining pending tunnel for port ${port}:`);
      }
      const info = await pending;
      return { url: info.publicUrl, tunnelId: info.tunnelId, isLocalhost: true };
    }
    // 3. Check cross-process registry — another MCP instance may own a tunnel.
    //    Borrow only if the entry is fresh (PID alive AND touched within
    //    REGISTRY_FRESHNESS_TTL_MS — defends against PID-reuse, bead 3th).
    //    Borrowing is an optimization: if the registry cannot be read we fall
    //    through and create our own tunnel instead of failing the request.
    let registry;
    let regEntry;
    try {
      registry = this.reg.read();
      regEntry = registry[String(port)];
    }
    catch (err) {
      logger.warn(`Registry read failed for port ${port}; skipping cross-process borrow: ${err}`);
    }
    if (regEntry && this.isEntryUsable(regEntry)) {
      logger.info(`Borrowing tunnel from PID ${regEntry.ownerPid} for port ${port}: ${regEntry.publicUrl}`);
      const now = Date.now();
      const borrowed = {
        tunnelId: regEntry.tunnelId,
        originalUrl: url,
        tunnelUrl: regEntry.tunnelUrl,
        publicUrl: regEntry.publicUrl,
        port,
        createdAt: now,
        lastAccessedAt: now,
        isOwned: false,
      };
      this.activeTunnels.set(regEntry.tunnelId, borrowed);
      // Touch registry so the owner knows not to auto-shutoff (best-effort: a
      // failed write must not leave the borrowed tunnel tracked without a timer).
      try {
        regEntry.lastAccessedAt = now;
        this.reg.write(registry);
      }
      catch (err) {
        logger.warn(`Registry touch failed while borrowing tunnel for port ${port} (non-fatal): ${err}`);
      }
      this.resetTunnelTimer(borrowed);
      Telemetry.capture(TelemetryEvents.TUNNEL_PROVISIONED, { port, how: 'borrowed' });
      return { url: regEntry.publicUrl, tunnelId: regEntry.tunnelId, isLocalhost: true };
    }
    // 4. Create a new tunnel (this process becomes the owner)
    const creationPromise = this.createTunnel(url, port, tunnelId, authToken, keyId, revokeKey);
    this.pendingTunnels.set(port, creationPromise);
    let tunnelInfo;
    try {
      tunnelInfo = await creationPromise;
    }
    finally {
      // Always clear the pending slot so a failed creation can be retried.
      this.pendingTunnels.delete(port);
    }
    return { url: tunnelInfo.publicUrl, tunnelId: tunnelInfo.tunnelId, isLocalhost: true };
  }

  /**
   * Linear scan of activeTunnels for the first tunnel on the given port.
   *
   * @param {number} port Local port.
   * @returns {object|undefined} Tunnel info, if any.
   */
  findTunnelByPort(port) {
    for (const tunnel of this.activeTunnels.values()) {
      if (tunnel.port === port) {
        return tunnel;
      }
    }
    return undefined;
  }

  /**
   * Create a new owned tunnel via ngrok.connect() (with retries), track it
   * locally, register it in the shared registry (best-effort) and arm its
   * auto-shutoff timer.
   *
   * @param {string} originalUrl Original localhost URL (scheme decides http/https upstream).
   * @param {number} port Local port.
   * @param {string} tunnelId Tunnel id; also the subdomain of `.ngrok.debugg.ai`.
   * @param {string} authToken ngrok auth token.
   * @param {string} [keyId] Stored on the tunnel info.
   * @param {Function} [revokeKey] Stored on the tunnel info; called on stop.
   * @returns {Promise<object>} The new tunnel info.
   * @throws {Error} "Failed to create tunnel: …"; the underlying error is
   *   attached as `cause`.
   */
  async createTunnel(originalUrl, port, tunnelId, authToken, keyId, revokeKey) {
    await this.ensureInitialized();
    const tunnelDomain = `${tunnelId}.ngrok.debugg.ai`;
    logger.info(`Creating tunnel for localhost:${port} (domain: ${tunnelDomain})`);
    const isHttpsLocal = originalUrl.startsWith('https:');
    const inDocker = process.env.DOCKER_CONTAINER === 'true';
    const dockerHost = 'host.docker.internal';
    // Bead fhg: force IPv4 loopback when running against localhost. ngrok's
    // default resolution of a bare port or "localhost" can pick IPv6 [::1]
    // first on macOS/modern OSes, but most dev servers (Next.js, Vite) bind
    // only to 127.0.0.1 — resulting in ngrok connect:refused + ERR_NGROK_8012
    // on the browser side with no actionable error back to the MCP caller.
    // NOTE(review): the https branch still targets "localhost", not 127.0.0.1 —
    // confirm whether that is intentional (e.g. certificate host matching).
    let localAddr;
    if (isHttpsLocal) {
      localAddr = inDocker ? `https://${dockerHost}:${port}` : `https://localhost:${port}`;
    }
    else {
      localAddr = inDocker ? `${dockerHost}:${port}` : `127.0.0.1:${port}`;
    }
    // Bead ixh: retry ngrok.connect transient failures. Attempts =
    // connectBackoffMs.length + 1 (3 with the default schedule):
    //  - Attempt 1: fresh connect
    //  - Attempt 2: after the first backoff; the ngrok agent module is reset
    //    before waiting (the "agent died" recovery path)
    //  - Later attempts: after the next backoff, with the already-reset agent
    // Auth-token errors short-circuit at any attempt — no point looping.
    // Bead 42g: fault injection + trace, driven by getFaultModeFromEnv().
    const faultMode = getFaultModeFromEnv();
    const faults = new FaultInjector(faultMode);
    const trace = new TunnelTrace();
    trace.emit('createTunnel.start', { port, tunnelId, hasFaultMode: !!faultMode });
    const connectWithRetry = async () => {
      const sleep = (ms) => new Promise((r) => setTimeout(r, ms));
      const BACKOFF_MS = this.connectBackoffMs; // bead ixh: test-overridable
      const MAX_ATTEMPTS = BACKOFF_MS.length + 1; // N sleeps between N+1 attempts
      const connectOpts = {
        proto: 'http',
        addr: localAddr,
        hostname: tunnelDomain,
        authtoken: authToken,
      };
      let lastError;
      for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
        trace.emit('connect.attempt.start', { attempt });
        // Optional fault-injected delay before each attempt.
        const delayMs = faults.delayMsForAttempt();
        if (delayMs > 0) {
          trace.emit('connect.fault.delay', { attempt, delayMs });
          await sleep(delayMs);
        }
        try {
          const ngrok = await getNgrok();
          // Fault-inject a synthetic failure BEFORE ngrok.connect runs so we
          // can simulate connect-layer failures without hitting the real API.
          if (faults.shouldFailConnect()) {
            trace.emit('connect.fault.inject', { attempt, mode: 'fail-connect-N' });
            throw new Error(`[fault-inject] synthetic connect failure (attempt ${attempt})`);
          }
          const url = faults.shouldReturnEmptyUrl() ? '' : await ngrok.connect(connectOpts);
          if (!url) {
            trace.emit('connect.attempt.empty-url', { attempt });
            throw new Error(`ngrok.connect() returned empty URL (attempt ${attempt})`);
          }
          trace.emit('connect.attempt.success', { attempt });
          if (attempt > 1) {
            Telemetry.capture(TelemetryEvents.TUNNEL_PROVISION_RETRY, {
              attempt,
              outcome: 'success',
              stage: 'ngrok_connect',
            });
          }
          return url;
        }
        catch (err) {
          lastError = err;
          const msg = err instanceof Error ? err.message : String(err);
          trace.emit('connect.attempt.fail', { attempt, message: msg.slice(0, 200) });
          // Auth-class errors are non-retryable — retrying with the same token
          // would loop. Let the outer catch classify the message.
          if (/authtoken|unauthorized|\b401\b|\b403\b/i.test(msg)) {
            trace.emit('connect.giving-up', { reason: 'auth-error' });
            Telemetry.capture(TelemetryEvents.TUNNEL_PROVISION_RETRY, {
              attempt,
              outcome: 'giving-up',
              stage: 'ngrok_connect',
              reason: 'auth-error',
            });
            throw err;
          }
          const isLastAttempt = attempt >= MAX_ATTEMPTS;
          Telemetry.capture(TelemetryEvents.TUNNEL_PROVISION_RETRY, {
            attempt,
            outcome: isLastAttempt ? 'giving-up' : 'will-retry',
            stage: 'ngrok_connect',
          });
          if (isLastAttempt) {
            trace.emit('connect.giving-up', { reason: 'max-attempts' });
            throw err;
          }
          // Between attempt 1→2, do an agent-reset (covers the "agent died"
          // failure mode that used to be the only retried case). Between 2→3,
          // just wait — the reset already happened.
          if (attempt === 1) {
            logger.warn(`ngrok.connect() failed (attempt 1/${MAX_ATTEMPTS}), resetting agent: ${msg}`);
            trace.emit('agent.reset');
            resetNgrokModule();
            this.initialized = false;
            await this.ensureInitialized();
          }
          else {
            logger.warn(`ngrok.connect() failed (attempt ${attempt}/${MAX_ATTEMPTS}), will retry: ${msg}`);
          }
          const backoffMs = BACKOFF_MS[attempt - 1] ?? BACKOFF_MS[BACKOFF_MS.length - 1];
          trace.emit('connect.backoff', { attempt, backoffMs });
          await sleep(backoffMs);
        }
      }
      // Unreachable (loop always returns or throws), but satisfy TS
      throw lastError ?? new Error('connectWithRetry: exhausted attempts without error');
    };
    try {
      const tunnelUrl = await connectWithRetry();
      const publicUrl = generateTunnelUrl(originalUrl, tunnelId);
      const now = Date.now();
      const tunnelInfo = {
        tunnelId,
        originalUrl,
        tunnelUrl,
        publicUrl,
        port,
        createdAt: now,
        lastAccessedAt: now,
        isOwned: true,
        keyId,
        revokeKey,
      };
      this.activeTunnels.set(tunnelId, tunnelInfo);
      // Register in shared cross-process registry (best-effort: the tunnel is
      // usable locally even if other processes cannot discover it).
      try {
        const registry = this.reg.read();
        registry[String(port)] = {
          tunnelId,
          publicUrl,
          tunnelUrl,
          port,
          ownerPid: process.pid,
          lastAccessedAt: now,
        };
        this.reg.write(registry);
      }
      catch (err) {
        logger.warn(`Registry registration failed for tunnel ${tunnelId} (non-fatal): ${err}`);
      }
      this.resetTunnelTimer(tunnelInfo);
      trace.emit('createTunnel.success', { tunnelId, publicUrl });
      logger.info(`Tunnel created: ${publicUrl} → localhost:${port}`);
      Telemetry.capture(TelemetryEvents.TUNNEL_PROVISIONED, { port, how: 'created' });
      return tunnelInfo;
    }
    catch (error) {
      const msg = error instanceof Error ? error.message : 'Unknown error';
      trace.emit('createTunnel.fail', { message: msg.slice(0, 200) });
      // Bead 42g: log the lifecycle trace at WARN so operators can post-mortem.
      // Keeping it out of the thrown error text so we don't leak internals to
      // users.
      logger.warn(`Tunnel lifecycle trace (fail path):\n${trace.format()}`);
      // Preserve the underlying error as `cause` so its stack survives the rewrap.
      if (msg.includes('authtoken')) {
        throw new Error(`Failed to create tunnel: invalid auth token. ${msg}`, { cause: error });
      }
      throw new Error(`Failed to create tunnel: ${msg}`, { cause: error });
    }
  }

  // ── Helpers ─────────────────────────────────────────────────────────────────

  /**
   * One-time warm-up: load the ngrok module and call its getApi(). Errors are
   * ignored here on purpose so that connect() surfaces the real failure.
   *
   * @returns {Promise<void>}
   */
  async ensureInitialized() {
    if (!this.initialized) {
      try {
        const ngrok = await getNgrok();
        ngrok.getApi();
      }
      catch {
        // ignore — let connect surface real errors
      }
      this.initialized = true;
    }
  }

  /**
   * (Re)arm a tunnel's auto-shutoff timer and stamp its local lastAccessedAt.
   *
   * When the timer fires for an owned tunnel, the shared registry is consulted
   * first: if this tunnel's own entry was touched within TUNNEL_TIMEOUT_MS
   * (e.g. by a borrowing process), the timer is re-armed instead of stopping.
   * An entry under the same port that belongs to a different tunnel does not
   * count as activity.
   *
   * @param {object} tunnelInfo Local tunnel info.
   */
  resetTunnelTimer(tunnelInfo) {
    this.clearTunnelTimer(tunnelInfo);
    tunnelInfo.lastAccessedAt = Date.now();
    tunnelInfo.autoShutoffTimer = setTimeout(async () => {
      // For owned tunnels: if another process recently touched the registry entry,
      // reset the timer rather than disconnecting — that process is still using it.
      if (tunnelInfo.isOwned) {
        try {
          const entry = this.reg.read()[String(tunnelInfo.port)];
          const touchedRecently = this.isRegistryEntryFor(entry, tunnelInfo) &&
            Date.now() - entry.lastAccessedAt < this.TUNNEL_TIMEOUT_MS;
          if (touchedRecently) {
            logger.info(`Tunnel ${tunnelInfo.tunnelId} accessed by another process — extending lifetime`);
            this.resetTunnelTimer(tunnelInfo);
            return;
          }
        }
        catch {
          // best-effort; proceed with shutoff
        }
      }
      logger.info(`Auto-shutting down tunnel ${tunnelInfo.tunnelId} after inactivity`);
      Telemetry.capture(TelemetryEvents.TUNNEL_STOPPED, { port: tunnelInfo.port, reason: 'auto-shutoff', isOwned: tunnelInfo.isOwned });
      await this.stopTunnel(tunnelInfo.tunnelId).catch((err) => logger.error(`Failed to auto-shutdown tunnel ${tunnelInfo.tunnelId}:`, err));
    }, this.TUNNEL_TIMEOUT_MS);
  }
}
// Shared default instance, built with the default registry. Because it is
// constructed at module scope, the constructor's best-effort registry prune
// runs as soon as this module is first imported.
const tunnelManager = new TunnelManager();
// Named export: the shared instance. Default export: the class itself, so
// callers can construct their own manager with a custom registry.
export { tunnelManager };
export default TunnelManager;