UNPKG

@debugg-ai/debugg-ai-mcp

Version:

Zero-Config, Fully AI-Managed End-to-End Testing for all code gen platforms.

586 lines (585 loc) 31.2 kB
/**
 * Test Page Changes Handler
 * Executes the App Evaluation Workflow via the 4-step pattern:
 * find template → execute → poll → result
 */
import { config } from '../config/index.js';
import { Logger } from '../utils/logger.js';
import { handleExternalServiceError } from '../utils/errors.js';
import { fetchImageAsBase64, imageContentBlock, resourceLinkBlock, artifactResourceLinks } from '../utils/imageUtils.js';
import { DebuggAIServerClient } from '../services/index.js';
import { TunnelProvisionError } from '../services/tunnels.js';
import { resolveTargetUrl, buildContext, findExistingTunnel, ensureTunnel, sanitizeResponseUrls, touchTunnelById, } from '../utils/tunnelContext.js';
import { detectRepoName } from '../utils/gitContext.js';
import { tunnelManager } from '../services/ngrok/tunnelManager.js';
import { probeLocalPort, probeTunnelHealth } from '../utils/localReachability.js';
import { extractLocalhostPort } from '../utils/urlParser.js';
import { getCachedTemplateUuid, getCachedProjectUuid, invalidateTemplateCache, invalidateProjectCache, } from '../utils/handlerCaches.js';
import { isTransientWorkflowError, transientReasonTag } from '../utils/transientErrors.js';
import { Telemetry, TelemetryEvents } from '../utils/telemetry.js';

const logger = new Logger({ module: 'testPageChangesHandler' });
const TEMPLATE_NAME = 'app evaluation';

/**
 * Resolve how many times a workflow execution may be retried after a
 * transient backend failure (bead kbxy).
 *
 * Reads the DEBUGGAI_TRANSIENT_RETRIES environment variable:
 *   - unset or empty string          → 1 (the default)
 *   - not parseable as a base-10 int → 1
 *   - negative                       → 1
 *   - anything else                  → the parsed value, capped at 3
 *
 * The cap balances reliability against quota cost; retries are only taken
 * for the documented transient patterns (utils/transientErrors.ts).
 *
 * @returns {number} Retry budget, an integer in the range 0..3.
 */
function getMaxTransientRetries() {
    const configured = process.env.DEBUGGAI_TRANSIENT_RETRIES;
    const isUnset = configured === undefined || configured === '';
    if (isUnset) {
        return 1;
    }
    const parsed = parseInt(configured, 10);
    // NaN fails the finiteness check, so garbage input falls back to the default.
    const isUsable = Number.isFinite(parsed) && parsed >= 0;
    return isUsable ? Math.min(parsed, 3) : 1;
}
// Concurrency control — max 2 simultaneous browser checks.
// Additional requests queue and run when a slot opens.
const MAX_CONCURRENT = 2;
let running = 0;
const queue = [];

/**
 * Take one of the MAX_CONCURRENT execution slots.
 * Resolves immediately when a slot is free; otherwise parks the caller in
 * `queue` until releaseSlot() hands a slot over.
 *
 * @returns {Promise<void>}
 */
async function acquireSlot() {
    if (running < MAX_CONCURRENT) {
        running++;
        return;
    }
    await new Promise((resolve) => queue.push({ resolve }));
}

/**
 * Give a slot back. If a caller is waiting, the slot is transferred to it
 * directly (FIFO), so `running` is re-incremented before the waiter resumes.
 */
function releaseSlot() {
    running--;
    const next = queue.shift();
    if (next) {
        running++;
        next.resolve();
    }
}

/**
 * Return a log-safe view of an object that may carry credentials.
 * When `fields` is an object with a non-nullish `password`, a shallow copy
 * with the password masked is returned; the input itself is never mutated.
 * Anything else (non-objects, objects without a password) is returned as-is.
 *
 * @param {*} fields - Value about to be handed to the logger.
 * @returns {*} `fields`, or a shallow copy with `password` replaced by '[REDACTED]'.
 */
function redactSecretsForLog(fields) {
    if (fields === null || typeof fields !== 'object' || fields.password == null) {
        return fields;
    }
    return { ...fields, password: '[REDACTED]' };
}

/**
 * Public entry point for the check_app_in_browser tool.
 * Waits for a concurrency slot, runs the real handler, and always releases
 * the slot — whether the handler returns or throws.
 *
 * @param {object} input - Tool arguments (forwarded unchanged).
 * @param {object} context - Tool invocation context (forwarded unchanged).
 * @param {Function} [progressCallback] - Optional progress notifier (forwarded unchanged).
 * @returns {Promise<object>} Whatever testPageChangesHandlerInner resolves to.
 */
export async function testPageChangesHandler(input, context, progressCallback) {
    await acquireSlot();
    try {
        return await testPageChangesHandlerInner(input, context, progressCallback);
    }
    finally {
        releaseSlot();
    }
}

/**
 * Run one browser evaluation end to end:
 *   1. for localhost targets, probe the port and reuse/provision a tunnel
 *      (skipped in dev mode), then verify traffic flows through it;
 *   2. resolve the evaluation template and (optionally) the project;
 *   3. execute the workflow and poll it, retrying on transient backend errors;
 *   4. shape the final execution into a tool response (JSON text, optional
 *      screenshot image, artifact resource links).
 *
 * @param {object} input - Tool arguments. Fields read here: repoName,
 *   description, environmentId, credentialId, credentialRole, username,
 *   password (plus whatever resolveTargetUrl reads).
 * @param {object} context - Tool invocation context (not read in this function).
 * @param {Function} [rawProgressCallback] - Optional async progress notifier,
 *   called with `{ progress, total, message }`.
 * @returns {Promise<{content: Array<object>, isError?: boolean}>} Tool result.
 *   `isError: true` is set for the two structured pre-flight failures
 *   (LocalServerUnreachable, TunnelTrafficBlocked).
 * @throws The value produced by handleExternalServiceError for any other failure.
 */
async function testPageChangesHandlerInner(input, context, rawProgressCallback) {
    const startTime = Date.now();
    // `input` can carry a plaintext password — never hand it to the logger raw.
    logger.toolStart('check_app_in_browser', redactSecretsForLog(input));
    // Bead 0bq: wrap the progress callback in a circuit-breaker so a single
    // client-side rejection of a stale progressToken (which would normally
    // throw up the stack and abort the handler, or — worse — arrive post-response
    // and tear down the stdio transport) is swallowed and disables further
    // emissions in this request.
    let progressDisabled = false;
    const progressCallback = rawProgressCallback
        ? async (update) => {
            if (progressDisabled)
                return;
            try {
                await rawProgressCallback(update);
            }
            catch (err) {
                progressDisabled = true;
                logger.warn('Progress emission failed; disabling further emissions for this request', {
                    error: err instanceof Error ? err.message : String(err),
                });
            }
        }
        : undefined;
    const client = new DebuggAIServerClient(config.api.key);
    await client.init();
    const originalUrl = resolveTargetUrl(input);
    let ctx = buildContext(originalUrl);
    let keyId;
    // Closing stdin means the client went away: abort polling and mute progress.
    const abortController = new AbortController();
    const onStdinClose = () => {
        abortController.abort();
        progressDisabled = true; // client is gone — stop emitting
    };
    process.stdin.once('close', onStdinClose);
    // Progress budget: 3 setup steps + 25 execution steps = 28 total
    const SETUP_STEPS = 3;
    const MAX_EXEC_STEPS = 25;
    const TOTAL_STEPS = SETUP_STEPS + MAX_EXEC_STEPS;
    try {
        // --- Tunnel: reuse existing or provision a fresh one ---
        if (ctx.isLocalhost) {
            // Bead 1om: pre-flight local port probe BEFORE committing to backend
            // provision + ngrok session. If the user's dev server isn't listening,
            // fail in ~1.5s with a structured error instead of burning 5 minutes
            // on a browser agent trying to reach a dead tunnel.
            const localPort = extractLocalhostPort(ctx.originalUrl);
            if (typeof localPort === 'number') {
                const probe = await probeLocalPort(localPort);
                if (!probe.reachable) {
                    const payload = {
                        error: 'LocalServerUnreachable',
                        message: `No server listening on 127.0.0.1:${localPort}. Start your dev server on that port before running check_app_in_browser. Probe result: ${probe.code} (${probe.detail ?? 'no detail'}).`,
                        detail: {
                            port: localPort,
                            probeCode: probe.code,
                            probeDetail: probe.detail,
                            elapsedMs: probe.elapsedMs,
                        },
                    };
                    logger.warn(`Pre-flight port probe failed for ${ctx.originalUrl}: ${probe.code} in ${probe.elapsedMs}ms`);
                    return { content: [{ type: 'text', text: JSON.stringify(payload, null, 2) }], isError: true };
                }
            }
            if (config.devMode) {
                // Dev mode: local backend can reach localhost directly — no tunnel needed.
                logger.info(`check_app_in_browser: dev mode — using localhost URL directly: ${ctx.originalUrl}`);
            }
            else {
                if (progressCallback) {
                    await progressCallback({ progress: 1, total: TOTAL_STEPS, message: 'Provisioning secure tunnel for localhost...' });
                }
                const reused = findExistingTunnel(ctx);
                if (reused) {
                    ctx = reused;
                    logger.info(`Reusing tunnel: ${ctx.targetUrl} (id: ${ctx.tunnelId})`);
                }
                else {
                    let tunnel;
                    try {
                        tunnel = await client.tunnels.provisionWithRetry();
                    }
                    catch (provisionError) {
                        const msg = provisionError instanceof Error ? provisionError.message : String(provisionError);
                        const diag = provisionError instanceof TunnelProvisionError ? ` ${provisionError.diagnosticSuffix()}` : '';
                        throw new Error(`Failed to provision tunnel for ${ctx.originalUrl}. ` +
                            `The remote browser needs a secure tunnel to reach your local dev server. ` +
                            `Make sure your dev server is running on the specified port and try again. ` +
                            `(Detail: ${msg})${diag}`);
                    }
                    // Remember the key so the finally block can revoke it if the
                    // tunnel never comes up.
                    keyId = tunnel.keyId;
                    try {
                        ctx = await ensureTunnel(ctx, tunnel.tunnelKey, tunnel.tunnelId, tunnel.keyId, () => client.revokeNgrokKey(tunnel.keyId));
                    }
                    catch (tunnelError) {
                        const msg = tunnelError instanceof Error ? tunnelError.message : String(tunnelError);
                        throw new Error(`Tunnel creation failed for ${ctx.originalUrl}. ` +
                            `Could not establish a secure connection between the remote browser and your local port. ` +
                            `Verify your dev server is running and the port is accessible. ` +
                            `(Detail: ${msg})`);
                    }
                    logger.info(`Tunnel ready: ${ctx.targetUrl} (id: ${ctx.tunnelId})`);
                }
                // Bead 1om: verify traffic actually flows through the tunnel. The
                // tunnel can be established (ngrok.connect returns OK) yet refuse
                // to forward traffic — e.g., IPv4/IPv6 bind mismatch, or the dev
                // server died between the pre-flight probe and here. Catch it now,
                // in ~1s, not via a 5-minute browser-agent false-pass.
                if (ctx.targetUrl) {
                    const health = await probeTunnelHealth(ctx.targetUrl);
                    if (!health.healthy) {
                        const payload = {
                            error: 'TunnelTrafficBlocked',
                            message: `Tunnel was established but traffic isn't reaching the dev server. ${health.detail ?? ''} Common causes: dev server binds to 0.0.0.0 or ::1 but not 127.0.0.1; dev server crashed; firewall.`,
                            detail: {
                                code: health.code,
                                status: health.status,
                                ngrokErrorCode: health.ngrokErrorCode,
                                elapsedMs: health.elapsedMs,
                            },
                        };
                        logger.warn(`Tunnel health probe failed for ${ctx.targetUrl}: ${health.code} ${health.ngrokErrorCode ?? ''} in ${health.elapsedMs}ms`);
                        // Tear down the broken tunnel so a subsequent call doesn't reuse it.
                        // stopTunnel handles both owned (ngrok disconnect + key revoke) and
                        // borrowed (just drops local ref) cases.
                        if (ctx.tunnelId) {
                            tunnelManager.stopTunnel(ctx.tunnelId).catch((err) => logger.warn(`Failed to stop broken tunnel ${ctx.tunnelId}: ${err}`));
                        }
                        // keyId is consumed by stopTunnel's revoke path; clear so the
                        // outer finally block doesn't double-revoke.
                        keyId = undefined;
                        return { content: [{ type: 'text', text: JSON.stringify(payload, null, 2) }], isError: true };
                    }
                }
            }
        }
        // --- Resolve template + project in parallel (both independent post-tunnel) ---
        if (progressCallback) {
            await progressCallback({ progress: 2, total: TOTAL_STEPS, message: 'Locating evaluation workflow template...' });
        }
        const repoName = input.repoName || detectRepoName();
        const [templateUuid, projectUuid] = await Promise.all([
            getCachedTemplateUuid(TEMPLATE_NAME, async () => {
                return client.workflows.findEvaluationTemplate();
            }),
            repoName
                ? getCachedProjectUuid(repoName, async (repo) => {
                    // A failed project lookup is non-fatal: the run proceeds without a project.
                    try {
                        return await client.findProjectByRepoName(repo);
                    }
                    catch (err) {
                        logger.warn(`Failed to look up project for repo "${repo}": ${err}`);
                        return null;
                    }
                })
                : Promise.resolve(undefined),
        ]);
        if (!templateUuid) {
            throw new Error('App Evaluation Workflow Template not found. ' +
                'Ensure the template is seeded in the backend (GET /api/v1/workflows/?is_template=true).');
        }
        if (repoName && !projectUuid) {
            logger.info(`No project found for repo "${repoName}" — proceeding without project_id`);
        }
        // --- Build context data (camelCase here — axiosTransport auto-converts to snake_case) ---
        const contextData = {
            targetUrl: ctx.targetUrl ?? originalUrl,
            question: input.description,
        };
        if (projectUuid) {
            contextData.projectId = projectUuid;
        }
        // --- Build env (credentials/environment) ---
        const env = {};
        if (input.environmentId)
            env.environmentId = input.environmentId;
        if (input.credentialId)
            env.credentialId = input.credentialId;
        if (input.credentialRole)
            env.credentialRole = input.credentialRole;
        if (input.username)
            env.username = input.username;
        if (input.password)
            env.password = input.password;
        const hasEnv = Object.keys(env).length > 0;
        // --- Execute ---
        // Log a redacted view only: `env` may hold the caller's plaintext password.
        logger.info('Sending contextData', { contextData, env: hasEnv ? redactSecretsForLog(env) : undefined });
        if (progressCallback) {
            await progressCallback({ progress: 3, total: TOTAL_STEPS, message: 'Queuing workflow execution...' });
        }
        // --- Execute + Poll (with bounded retry on transient errors, bead kbxy) ---
        // Progress phases (per attempt):
        //   1-3:   MCP setup (tunnel, template, queue) — already sent above
        //   4-6:   Backend setup (trigger, browser.setup, subworkflow starting)
        //   7-27:  Agent steps (mapped from state.stepsTaken)
        //   28:    Complete
        const BACKEND_SETUP_END = 6;
        const TERMINAL_STATUSES = new Set(['completed', 'failed', 'cancelled']);
        const MAX_RETRIES = getMaxTransientRetries();
        let executeResponse;
        let executionUuid = '';
        let finalExecution;
        let attempt = 0;
        while (true) {
            attempt++;
            if (attempt > 1) {
                // Retry path — emit telemetry + progress notification + brief backoff.
                Telemetry.capture(TelemetryEvents.WORKFLOW_TRANSIENT_RETRY, {
                    tool: 'check_app_in_browser',
                    attempt,
                    reason: transientReasonTag(finalExecution),
                    previousExecutionId: executionUuid,
                    previousErrorMessage: finalExecution?.errorMessage?.slice(0, 200),
                    previousStateError: finalExecution?.state?.error?.slice(0, 200),
                });
                if (progressCallback) {
                    await progressCallback({
                        progress: SETUP_STEPS,
                        total: TOTAL_STEPS,
                        message: `Transient backend error — retrying (attempt ${attempt}/${MAX_RETRIES + 1})...`,
                    });
                }
                // Linear backoff: 1s before the first retry, 2s before the second, ...
                await new Promise(r => setTimeout(r, 1000 * (attempt - 1)));
            }
            // The real (unredacted) env is what gets sent to the backend.
            executeResponse = await client.workflows.executeWorkflow(templateUuid, contextData, hasEnv ? env : undefined);
            executionUuid = executeResponse.executionUuid;
            logger.info(`Execution queued: ${executionUuid}${attempt > 1 ? ` (retry ${attempt - 1}/${MAX_RETRIES})` : ''}`);
            // Closure state — reset PER ATTEMPT so progress numbers don't double-count
            // across retries.
            let lastStepsTaken = 0;
            let observedMaxSteps = MAX_EXEC_STEPS;
            finalExecution = await client.workflows.pollExecution(executionUuid, async (exec) => {
                // Keep the tunnel alive while the workflow is actively running
                if (ctx.tunnelId)
                    touchTunnelById(ctx.tunnelId);
                const nodes = exec.nodeExecutions ?? [];
                const stepsTaken = Math.max(nodes.filter(n => n.nodeType === 'brain.step').length, exec.state?.stepsTaken ?? 0);
                if (stepsTaken !== lastStepsTaken) {
                    lastStepsTaken = stepsTaken;
                    logger.info(`Execution status: ${exec.status}, nodes: ${nodes.length}, steps: ${stepsTaken}`);
                }
                if (!progressCallback)
                    return;
                // Bead 0bq: emit the final "Complete:" progress INSIDE this callback
                // when terminal status is detected. pollExecution will return on the
                // next line (line 183 in services/workflows.ts), so there's no
                // post-pollExecution progress emission that could race the response.
                if (TERMINAL_STATUSES.has(exec.status)) {
                    const terminalOutcome = exec.state?.outcome ?? exec.status;
                    await progressCallback({
                        progress: TOTAL_STEPS,
                        total: TOTAL_STEPS,
                        message: `Complete: ${terminalOutcome}`,
                    });
                    return;
                }
                // --- Compute progress number ---
                let execProgress;
                let message;
                if (stepsTaken > 0) {
                    // Agent is actively stepping — map into slots 7..27
                    if (stepsTaken > observedMaxSteps)
                        observedMaxSteps = stepsTaken + 5;
                    const stepSlots = TOTAL_STEPS - BACKEND_SETUP_END - 1; // 21 slots
                    execProgress = BACKEND_SETUP_END + Math.max(1, Math.round((stepsTaken / observedMaxSteps) * stepSlots));
                    // Never report the final slot here; it is reserved for "Complete:".
                    execProgress = Math.min(execProgress, TOTAL_STEPS - 1);
                    // Use state.currentAction for the message (backend sends intent + actionType)
                    const ca = exec.state?.currentAction;
                    if (ca?.intent) {
                        const action = ca.actionType ?? ca.action_type ?? 'working';
                        message = `Step ${stepsTaken}: [${action}] ${ca.intent}`;
                    }
                    else {
                        message = `Agent evaluating... (step ${stepsTaken})`;
                    }
                }
                else {
                    // No agent steps yet — show backend setup progress from node transitions
                    const hasSubworkflow = nodes.some(n => n.nodeType === 'subworkflow.run');
                    const hasBrowserSetup = nodes.some(n => n.nodeType === 'browser.setup');
                    const browserReady = nodes.some(n => n.nodeType === 'browser.setup' && n.status === 'success');
                    if (browserReady || hasSubworkflow) {
                        execProgress = BACKEND_SETUP_END;
                        message = 'Browser ready, agent starting...';
                    }
                    else if (hasBrowserSetup) {
                        execProgress = SETUP_STEPS + 2;
                        message = 'Launching browser...';
                    }
                    else if (nodes.length > 0) {
                        execProgress = SETUP_STEPS + 1;
                        message = 'Workflow triggered, preparing...';
                    }
                    else {
                        execProgress = SETUP_STEPS + 1;
                        message = 'Waiting for execution to start...';
                    }
                }
                await progressCallback({ progress: execProgress, total: TOTAL_STEPS, message });
            }, abortController.signal);
            // Decide retry vs exit: only retry on documented transient signatures
            // AND while we still have budget. Otherwise break and surface whatever
            // result the agent reached.
            if (attempt > MAX_RETRIES)
                break;
            if (!isTransientWorkflowError(finalExecution))
                break;
            logger.warn(`Transient backend error detected (${transientReasonTag(finalExecution) ?? 'unknown'}) — ` +
                `retrying (attempt ${attempt + 1}/${MAX_RETRIES + 1})`);
        }
        const duration = Date.now() - startTime;
        // --- Format result ---
        const outcome = finalExecution.state?.outcome ?? finalExecution.status;
        const nodes = finalExecution.nodeExecutions ?? [];
        // subworkflow.run is the current graph shape — carries outcome, actionHistory, screenshot
        const subworkflowNode = nodes.find(n => n.nodeType === 'subworkflow.run');
        // surfer.execute_task and brain.step/brain.evaluate are older graph shapes
        const surferNode = nodes.find(n => n.nodeType === 'surfer.execute_task');
        // Action trace: brain.step nodes (old) → subworkflow.run actionHistory (new)
        const brainSteps = nodes
            .filter(n => n.nodeType === 'brain.step' && n.outputData)
            .sort((a, b) => a.executionOrder - b.executionOrder);
        const actionTrace = brainSteps.map((n, i) => {
            const d = n.outputData.decision ?? n.outputData;
            return {
                step: i + 1,
                action: d.actionType ?? d.action_type,
                intent: d.intent,
                target: d.target,
                value: d.value ?? undefined,
                success: n.outputData.success ?? n.status === 'success',
                durationMs: n.executionTimeMs,
            };
        });
        const subworkflowHistory = subworkflowNode?.outputData?.actionHistory;
        if (actionTrace.length === 0 && Array.isArray(subworkflowHistory) && subworkflowHistory.length > 0) {
            subworkflowHistory.forEach((step, i) => {
                actionTrace.push({
                    step: i + 1,
                    action: step.actionType ?? step.action_type ?? step.action,
                    intent: step.intent,
                    target: step.target,
                    value: step.value ?? undefined,
                    success: step.success ?? true,
                    durationMs: step.durationMs ?? step.duration_ms ?? undefined,
                });
            });
        }
        // Evaluation: brain.evaluate (old) → subworkflow.run outcome/success (new)
        const evalNode = nodes.find(n => n.nodeType === 'brain.evaluate');
        let evaluation;
        if (evalNode?.outputData) {
            evaluation = {
                passed: evalNode.outputData.passed,
                outcome: evalNode.outputData.outcome,
                reason: evalNode.outputData.reason,
                verifications: evalNode.outputData.verifications,
            };
        }
        else if (subworkflowNode?.outputData) {
            const sw = subworkflowNode.outputData;
            evaluation = {
                passed: sw.success,
                outcome: sw.outcome,
                reason: sw.error || undefined,
            };
        }
        const stepsTaken = finalExecution.state?.stepsTaken ?? subworkflowNode?.outputData?.stepsTaken ?? actionTrace.length;
        const success = finalExecution.state?.success ?? subworkflowNode?.outputData?.success ?? false;
        const responsePayload = {
            outcome,
            success,
            status: finalExecution.status,
            stepsTaken,
            stepsBudget: MAX_EXEC_STEPS, // bead qmdd
            stepsRemaining: Math.max(0, MAX_EXEC_STEPS - (stepsTaken ?? 0)), // bead qmdd
            targetUrl: originalUrl,
            executionId: executionUuid,
            durationMs: finalExecution.durationMs ?? duration,
        };
        // Bead jqmj: failureCategory disambiguates the three meanings of 'fail':
        //   'agent-error'        — workflow/infra failure (Pydantic parse error,
        //                          backend exception, transport issue). Caller's
        //                          right move: retry-with-backoff.
        //   'assertion-mismatch' — agent ran the scenario but page state didn't
        //                          match expectations. Caller's right move: fix
        //                          code or update the test description.
        //   ('page-error' is reserved for v2 — needs a structured signal from
        //   backend to distinguish from assertion-mismatch reliably; today's
        //   inferrable info is too fragile.)
        // Field is OMITTED on success (no failure to categorize).
        if (!success) {
            // state.error is the AGENT's narrative — it can describe assertion
            // failures ("expected heading to contain Welcome") OR infrastructure
            // failures ("Pydantic JSON parse error"). Without a structured signal,
            // we only count it as 'agent-error' when paired with workflow-level
            // failure (status='failed') or transient signature.
            // status='failed' or errorMessage set → workflow-level / transport error.
            const hasInfraFailure = finalExecution.status === 'failed' || !!finalExecution.errorMessage;
            responsePayload.failureCategory = hasInfraFailure ? 'agent-error' : 'assertion-mismatch';
        }
        if (actionTrace.length > 0)
            responsePayload.actionTrace = actionTrace;
        if (evaluation)
            responsePayload.evaluation = evaluation;
        if (finalExecution.state?.error)
            responsePayload.agentError = finalExecution.state.error;
        if (finalExecution.errorMessage)
            responsePayload.errorMessage = finalExecution.errorMessage;
        if (finalExecution.errorInfo?.failedNodeId)
            responsePayload.failedNode = finalExecution.errorInfo.failedNodeId;
        if (executeResponse.resolvedEnvironmentId)
            responsePayload.resolvedEnvironmentId = executeResponse.resolvedEnvironmentId;
        if (executeResponse.resolvedCredentialId)
            responsePayload.resolvedCredentialId = executeResponse.resolvedCredentialId;
        if (surferNode?.outputData) {
            responsePayload.surferOutput = sanitizeResponseUrls(surferNode.outputData, ctx);
        }
        // Backend release 2026-04-25: browser_session block on execution detail
        // carries presigned S3 URLs for HAR + console log + recording. Pass through
        // verbatim — sanitizeResponseUrls below only strips ngrok hosts so S3 URLs
        // are preserved. Resolves client-feedback items #1 (network) + #7 (console).
        if (finalExecution.browserSession) {
            responsePayload.browserSession = finalExecution.browserSession;
        }
        logger.toolComplete('check_app_in_browser', duration);
        // NOTE (bead 0bq): the final "Complete:" progress is emitted INSIDE
        // pollExecution's onUpdate when terminal status is detected — see the
        // TERMINAL_STATUSES block above. Emitting it here (post-resolve) creates
        // a race where the progress can arrive AFTER the response on the wire,
        // making the client reject it as an unknown progressToken and close the
        // transport, breaking ALL subsequent tool calls.
        // Sanitize the whole payload so no tunnel URL leaks anywhere — including
        // agent-authored strings in actionTrace[*].intent, evaluation.reason, etc.
        const sanitizedPayload = sanitizeResponseUrls(responsePayload, ctx);
        const content = [
            { type: 'text', text: JSON.stringify(sanitizedPayload, null, 2) },
        ];
        // Screenshot: check for already-base64 field first (subworkflow.run), then URL-based fields
        const SCREENSHOT_URL_KEYS = ['finalScreenshot', 'screenshot', 'screenshotUrl', 'screenshotUri'];
        const GIF_KEYS = ['runGif', 'gifUrl', 'gif', 'videoUrl', 'recordingUrl'];
        let screenshotEmbedded = false;
        let gifUrl = null;
        // subworkflow.run carries screenshotB64 directly — no fetch needed
        const screenshotB64 = subworkflowNode?.outputData?.screenshotB64;
        if (typeof screenshotB64 === 'string' && screenshotB64) {
            logger.info('Embedding inline base64 screenshot from subworkflow.run');
            content.push(imageContentBlock(screenshotB64, 'image/png'));
            screenshotEmbedded = true;
        }
        // Scan node outputs for the first screenshot URL (only if nothing was
        // embedded inline) and the first recording URL; stop once both are settled.
        let screenshotUrl = null;
        for (const node of nodes) {
            const data = node.outputData ?? {};
            if (!screenshotEmbedded && !screenshotUrl) {
                for (const key of SCREENSHOT_URL_KEYS) {
                    if (typeof data[key] === 'string' && data[key]) {
                        screenshotUrl = data[key];
                        break;
                    }
                }
            }
            if (!gifUrl) {
                for (const key of GIF_KEYS) {
                    if (typeof data[key] === 'string' && data[key]) {
                        gifUrl = data[key];
                        break;
                    }
                }
            }
            if ((screenshotEmbedded || screenshotUrl) && gifUrl)
                break;
        }
        if (!screenshotEmbedded && screenshotUrl) {
            logger.info(`Embedding screenshot: ${screenshotUrl}`);
            // Best-effort: a failed fetch must not fail the tool call, but it
            // should be visible in the logs rather than vanish silently.
            const img = await fetchImageAsBase64(screenshotUrl).catch((err) => {
                logger.warn(`Failed to fetch screenshot for embedding: ${err instanceof Error ? err.message : String(err)}`);
                return null;
            });
            if (img)
                content.push(imageContentBlock(img.data, img.mimeType));
        }
        // Artifact links (bead 8qndk): run recording (legacy GIF field) + the
        // browserSession presigned URLs (HAR / console log / recording). Returned as
        // resource_links, not base64-inlined. Screenshots stay inline above so
        // vision-capable clients still SEE them.
        const artifactLinks = [
            ...(gifUrl ? [resourceLinkBlock(gifUrl, 'run-recording.gif', {
                    mimeType: 'image/gif',
                    title: 'Run recording',
                    description: 'Animated recording of the run (presigned URL — open or fetch on demand).',
                })] : []),
            ...artifactResourceLinks(sanitizedPayload.browserSession),
        ];
        // De-duplicate by URI so the same artifact is never linked twice.
        const seenArtifactUris = new Set();
        for (const link of artifactLinks) {
            if (link.uri && !seenArtifactUris.has(link.uri)) {
                seenArtifactUris.add(link.uri);
                content.push(link);
            }
        }
        return { content };
    }
    catch (error) {
        const duration = Date.now() - startTime;
        logger.toolError('check_app_in_browser', error, duration);
        // A "not found" / 401 may mean a cached template or project UUID went
        // stale — drop both caches so the next call resolves them afresh.
        if (error instanceof Error && (error.message.includes('not found') || error.message.includes('401'))) {
            invalidateTemplateCache();
            invalidateProjectCache();
        }
        throw handleExternalServiceError(error, 'DebuggAI', 'test execution');
    }
    finally {
        process.stdin.removeListener('close', onStdinClose);
        // Tunnel is intentionally NOT torn down here — tunnelManager reuses it on
        // subsequent calls to the same port and auto-shutoffs after 55 min idle.
        // Process-exit cleanup happens via stopAllTunnels() in the SIGINT/SIGTERM
        // handlers in index.ts.
        if (!ctx.tunnelId && keyId) {
            // Provisioned a key but tunnel creation failed — revoke the orphaned key.
            client.revokeNgrokKey(keyId).catch(err => logger.warn(`Failed to revoke unused ngrok key ${keyId}: ${err}`));
        }
    }
}