UNPKG

donobu

Version:

Create browser automations with an LLM agent and replay them as Playwright scripts.

1,168 lines 85.5 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.TRIAGE_PERSISTENCE_FILE_IDS = exports.TreatmentPlan = exports.RemediationStepSchema = exports.FailureReasonSchema = exports.AdditionalDataRequestSchema = void 0; exports.deriveHeuristicAssessment = deriveHeuristicAssessment; exports.deriveHistoricalSignals = deriveHistoricalSignals; exports.gatherTestFailureEvidence = gatherTestFailureEvidence; exports.generateTreatmentPlanFromEvidence = generateTreatmentPlanFromEvidence; exports.reconcileTreatmentPlan = reconcileTreatmentPlan; exports.summarizeFlowHistory = summarizeFlowHistory; const crypto_1 = require("crypto"); const fs = __importStar(require("fs/promises")); const path = __importStar(require("path")); const typescript_1 = __importDefault(require("typescript")); const v4_1 = require("zod/v4"); const AnalyzePageTextTool_1 = require("../../../tools/AnalyzePageTextTool"); const MarkObjectiveCompleteTool_1 = require("../../../tools/MarkObjectiveCompleteTool"); const MarkObjectiveNotCompletableTool_1 = require("../../../tools/MarkObjectiveNotCompletableTool"); const SummarizeLearningsTool_1 = require("../../../tools/SummarizeLearningsTool"); const Logger_1 = require("../../../utils/Logger"); const cacheLocator_1 = require("../../ai/cache/cacheLocator"); /** * # Test Failure Triage System * * Transforms Playwright test failures in Donobu-powered test suites into structured, * actionable **treatment plans**. A treatment plan tells both humans and automation *why* * the test failed, *how confident* the system is in that diagnosis, and *what to do next* * — whether that is retrying the automation, deleting a stale page.ai cache, filing a * product bug, or updating selectors in the test code. * * --- * * ## Architecture Overview * * The system operates in two phases that run in sequence: * * ### Phase 1 — Evidence Collection (`gatherTestFailureEvidence`) * * Called automatically by the Donobu test extension (`testExtension.ts`) in the * Playwright `afterEach` hook whenever a test fails. This phase: * * 1. Extracts error messages, stack traces, and assertion details from `TestInfo`. * 2. Loads the Donobu flow metadata (objective, run mode, state) and recent tool call * history from the persistence layer. * 3. Fetches **historical runs** of the same flow (by name) from the flows manager to * detect flakiness, regression patterns, and prior self-heal success. * 4. Captures the **failure screenshot** (last tool call screenshot from the current * run) and the **baseline screenshot** (last tool call screenshot from the most * recent successful historical run) for visual comparison. * 5. Reads the source of the failing test case for contextual grounding. * 6. Runs the **heuristic classifier** (`deriveHeuristicAssessment`) which uses * rule-based pattern matching over errors, tool calls, stale-cache indicators, * and historical signals to produce a preliminary diagnosis — including a failure * reason, confidence score, and retry recommendation. * 7. Persists the complete evidence bundle (JSON + screenshots) to disk as a * `FailureEvidenceRecord`. * * ### Phase 2 — Treatment Plan Generation (`generateTreatmentPlanFromEvidence`) * * Called by the Donobu CLI (`donobu-cli.ts`) after evidence files are collected. This * phase reads the persisted evidence and **requires a GPT client** — there is no * heuristic-only fallback path. It: * * 1. Sends the full evidence bundle — including screenshots as vision input — to a * GPT model with a detailed system prompt, requesting a structured `TreatmentPlan` * response. * 2. **Reconciles** the GPT plan with heuristic signals (`reconcileTreatmentPlan`) to * enforce invariants the LLM might miss (e.g., forcing `shouldRetryAutomation` when * historical data shows prior self-heal success, or overriding retry-step priority * for stale-cache scenarios). * 3. Returns the final `TreatmentPlan` for the CLI to act on — potentially triggering * automatic retries, cache deletion, or surfacing remediation steps to the engineer. * * --- * * ## Data Signals * * The triage system draws from several complementary data sources, each targeting * different failure modes: * * | Signal | Source | What it reveals | * |-------------------------|-------------------------------|----------------------------------------------------| * | Error messages & stacks | `TestInfo.errors` | Direct cause (assertion, timeout, selector) | * | Tool call history | `FlowsPersistence` | What actions the AI took and their outcomes | * | Tool call parameters | `ToolCall.parameters` | Exact selectors, URLs, and inputs attempted | * | Flow metadata | `DonobuExtendedPage._dnb` | Run mode, objective, allowed tools, timing | * | Stale cache indicators | Derived from above | Whether page.ai cache staleness is the root cause | * | Historical flow runs | `DonobuFlowsManager.getFlows` | Flakiness, regression patterns, prior self-heal | * | Failure screenshot | Last tool call screenshot | Visual state of the page when the failure occurred | * | Baseline screenshot | Last successful run's screenshot | Visual reference for what the page *should* look like | * | Test source snippet | TypeScript AST parsing | The test's expectations and structure | * * --- * * ## Failure Classification * * Every treatment plan assigns one of the following failure reasons: * * - `SELECTOR_REGRESSION` — UI locators have gone stale. * - `STALE_CACHE_OR_INSTRUCTIONS` — The page.ai deterministic cache is outdated. * - `TIMING_OR_SYNCHRONISATION` — Race conditions, slow loads, or flaky waits. * - `NETWORK_OR_DEPENDENCY` — External service failures or connectivity issues. * - `APPLICATION_DEFECT` — A real bug in the product under test. * - `ASSERTION_DRIFT` — Test expectations no longer match valid application behavior. * - `AUTOMATION_SCRIPT_ISSUE` — The test script itself is incorrect. * - `AUTHENTICATION_FAILURE` — Session/auth problems prevented the test from running. * - `ENVIRONMENT_CONFIGURATION` — Infrastructure or environment misconfiguration. * - `TEST_DATA_UNAVAILABLE` — Required test data is missing or invalid. * - `UNKNOWN` — Insufficient signal to determine the cause. * * --- * * ## Getting the Most Out of This System * * ### 1. Name your flows consistently * * Historical analysis works by matching flows by name. If every test uses a unique, * stable flow name, the system can compare the current failure against all prior runs * of the same flow and detect flakiness, regressions, and self-heal patterns: * * ```ts * test('checkout flow adds item and completes purchase', async ({ page }) => { * const ai = await page.ai('Checkout — add item and purchase', { ... }); * // ... * }); * ``` * * ### 2. Let evidence persist to disk * * The default behavior writes evidence JSON and screenshots to the run directory. This * enables the CLI's Phase 2 to enrich the diagnosis with GPT and visual comparison. * Ensure `DONOBU_TRIAGE_DISABLED` is not set, and that `runDirectory` is writable: * * ```ts * // Evidence is gathered automatically on failure — no extra code needed. * // To customize the output directory: * await gatherTestFailureEvidence(testInfo, page, { * runDirectory: '/path/to/custom/output', * }); * ``` * * ### 3. Ensure a GPT client is available * * A GPT client is **required** for treatment plan generation. The LLM performs semantic * reasoning: it reads the test source, interprets tool call parameters, compares * screenshots visually, and produces human-readable remediation steps. The CLI * instantiates a GPT client automatically from configured credentials. * * ### 4. Use deterministic (cached) mode for stable flows * * When flows run in `DETERMINISTIC` mode with a page.ai cache, the triage system * activates its stale-cache detection pipeline — a composite scoring system that * weighs whether the cached instructions have gone stale versus whether the failure * is a legitimate test issue. This is the system's strongest diagnostic capability. * * ### 5. Inspect the evidence files * * Each failure produces a `failure-evidence-<id>.json` file (plus optional PNG * screenshots) in the run directory. These files are self-contained and can be * re-processed, shared for debugging, or fed back into `generateTreatmentPlanFromEvidence` * independently. * * --- * * ## Key Exports * * - `gatherTestFailureEvidence` — Phase 1 entry point. Call from a Playwright afterEach hook. * - `generateTreatmentPlanFromEvidence` — Phase 2 entry point. Requires a `GptClient` and a * `FailureEvidenceRecord`. * - `TreatmentPlan` — The Zod schema defining the treatment plan structure. * - `FailureReasonSchema` — The Zod enum of all possible failure classifications. */ const FailureReasonSchema = v4_1.z .enum([ 'UNKNOWN', 'AUTOMATION_SCRIPT_ISSUE', 'SELECTOR_REGRESSION', 'TIMING_OR_SYNCHRONISATION', 'ASSERTION_DRIFT', 'APPLICATION_DEFECT', 'AUTHENTICATION_FAILURE', 'ENVIRONMENT_CONFIGURATION', 'TEST_DATA_UNAVAILABLE', 'NETWORK_OR_DEPENDENCY', ]) .describe(`UNKNOWN: Triggered when no concrete root cause can be inferred. AUTOMATION_SCRIPT_ISSUE: The scripted steps are incorrect or incomplete. SELECTOR_REGRESSION: UI element locators have changed and need updates. TIMING_OR_SYNCHRONISATION: Wait conditions or timing assumptions failed. ASSERTION_DRIFT: The expected outcomes in assertions no longer match reality. APPLICATION_DEFECT: The product behaviour is broken and must be fixed upstream. AUTHENTICATION_FAILURE: Login, MFA, or session preconditions were not met. ENVIRONMENT_CONFIGURATION: Test infra or env vars are misconfigured. TEST_DATA_UNAVAILABLE: Seed data or fixtures are missing or expired. NETWORK_OR_DEPENDENCY: External services or network connectivity failed.`); exports.FailureReasonSchema = FailureReasonSchema; const RemediationCategorySchema = v4_1.z .enum([ 'RETRY_AUTOMATION', 'UPDATE_TEST_LOGIC', 'UPDATE_SELECTORS', 'ADJUST_TIMING', 'REFINE_ASSERTIONS', 'FIX_APPLICATION', 'VALIDATE_AUTHENTICATION', 'CHECK_ENVIRONMENT', 'REFRESH_TEST_DATA', 'STABILIZE_DEPENDENCIES', 'ESCALATE_MANUAL_REVIEW', 'UNKNOWN', ]) .describe('Categorises the type of remediation that should be attempted so that downstream systems can pick appropriate playbooks.'); const RemediationStepSchema = v4_1.z.object({ category: RemediationCategorySchema, summary: v4_1.z .string() .describe('Short actionable label describing what must happen next.'), details: v4_1.z .string() .describe('Specific guidance for performing the remediation step.'), }); exports.RemediationStepSchema = RemediationStepSchema; const AdditionalDataRequestSchema = v4_1.z.object({ description: v4_1.z .string() .describe('Information that would materially help confirm the root cause.'), suggestedSources: v4_1.z .array(v4_1.z.string()) .describe('Where to look for the requested data.') .default([]), }); exports.AdditionalDataRequestSchema = AdditionalDataRequestSchema; const AutomationDirectivesSchema = v4_1.z .object({ clearPageAiCache: v4_1.z .boolean() .describe('When true, clear cached Page.AI selectors before attempting an automated retry.') .optional(), targetTestFile: v4_1.z .string() .describe('Relative path to the Playwright spec that should be re-run when applying this plan.') .optional(), targetProject: v4_1.z .string() .describe('Playwright project name that should be used when re-running automation for this failure.') .optional(), additionalPlaywrightArgs: v4_1.z .array(v4_1.z.string()) .describe('Extra Playwright CLI arguments the orchestrator should append when applying this plan.') .optional(), }) .partial(); const TreatmentPlan = v4_1.z.object({ failureSummary: v4_1.z .string() .describe('A summary of the nature of the test failure'), failureReason: FailureReasonSchema, confidence: v4_1.z .number() .min(0) .max(1) .describe('Confidence score between 0 and 1 estimating how likely the failureReason classification is correct.'), observedIndicators: v4_1.z .array(v4_1.z.string()) .describe('Signals and observations that led to the triage conclusion.') .default([]), remediationSteps: v4_1.z .array(RemediationStepSchema) .describe('Ordered remediation steps to attempt.') .default([]), additionalDataRequests: v4_1.z .array(AdditionalDataRequestSchema) .describe('Extra context that would help if remediation stalls.') .default([]), shouldRetryAutomation: v4_1.z .boolean() .describe('True if the automation framework should attempt another run.'), requiresCodeChange: v4_1.z .boolean() .describe('True when the Playwright test code likely needs updates.'), requiresProductFix: v4_1.z .boolean() .describe('True when an application-level defect is suspected.'), notes: v4_1.z .string() .describe('Optional free-form notes that do not fit the structured fields.') .optional(), automationDirectives: AutomationDirectivesSchema.optional(), }); exports.TreatmentPlan = TreatmentPlan; const MAX_SERIALIZED_STRING_LENGTH = 10000; const MAX_TOOL_CALL_PARAMETERS_LENGTH = 500; const MAX_TOOL_CALLS_TO_INCLUDE = 12; const TRIAGE_EVIDENCE_SCHEMA_VERSION = 2; const TRIAGE_PERSISTENCE_FILE_IDS = { evidence: 'triage-evidence.json', failureScreenshot: 'triage-failure-screenshot.png', baselineScreenshot: 'triage-baseline-screenshot.png', }; exports.TRIAGE_PERSISTENCE_FILE_IDS = TRIAGE_PERSISTENCE_FILE_IDS; /** * Ensures text blobs captured from tool calls or error messages fit within storage * limits without losing useful context by padding or truncation artifacts. */ function truncateString(value, maxLength) { if (value.length <= maxLength) { return value; } if (maxLength <= 3) { return value.slice(0, maxLength); } return `${value.slice(0, maxLength - 3)}...`; } /** * Serialises arbitrary values while constraining nested strings and the overall * payload so that GPT prompts and persistence never exceed downstream quotas. */ function safeStringify(value, maxLength, stringMaxLength = 500) { try { const serialized = JSON.stringify(value, (_, innerValue) => { if (typeof innerValue === 'string') { return truncateString(innerValue, stringMaxLength); } return innerValue; }, 2); return truncateString(serialized, maxLength); } catch (error) { const err = error; return `Failed to stringify: ${err.name}: ${err.message}`; } } /** * Strips sensitive or oversized fields from the persisted flow metadata so the * triage agent receives only the contextual attributes it can safely reason over. */ function sanitizeFlowMetadata(metadata) { if (!metadata) { return null; } return { id: metadata.id, name: metadata.name, runMode: metadata.runMode, state: metadata.state, targetWebsite: metadata.web?.targetWebsite ?? '', overallObjective: metadata.overallObjective, allowedTools: metadata.allowedTools, envVars: metadata.envVars, startedAt: metadata.startedAt, completedAt: metadata.completedAt, maxToolCalls: metadata.maxToolCalls, gptConfigName: metadata.gptConfigName, defaultMessageDuration: metadata.defaultMessageDuration, resultSummary: metadata.result ? JSON.stringify(metadata.result) : null, }; } /** * Condenses the Donobu tool invocation history so the most recent calls and their * outcomes can influence the triage decision without overwhelming the prompt. */ function summarizeToolCalls(toolCalls) { return toolCalls .filter((toolCall) => toolCall.outcome !== null && toolCall.completedAt !== null) .slice(-MAX_TOOL_CALLS_TO_INCLUDE) .map((toolCall) => ({ id: toolCall.id, toolName: toolCall.toolName, success: toolCall.outcome.isSuccessful, outcomeSummary: toolCall.outcome.forLlm, durationMs: toolCall.completedAt - toolCall.startedAt, page: toolCall.page, startedAtIso: new Date(toolCall.startedAt).toISOString(), completedAtIso: new Date(toolCall.completedAt).toISOString(), parameters: safeStringify(toolCall.parameters, MAX_TOOL_CALL_PARAMETERS_LENGTH, 200), })); } const HISTORY_QUERY_WINDOW_DAYS = 14; const HISTORY_QUERY_LIMIT = 20; const HISTORY_RECENT_RUNS_CAP = 10; /** * Compresses a set of historical flow runs into an aggregate summary compact * enough for both heuristic reasoning and inclusion in GPT prompts. */ function summarizeFlowHistory(flowName, flows) { const sorted = [...flows].sort((a, b) => (b.startedAt ?? 0) - (a.startedAt ?? 0)); let successCount = 0; let failureCount = 0; let otherCount = 0; for (const flow of sorted) { if (flow.state === 'SUCCESS') { successCount++; } else if (flow.state === 'FAILED') { failureCount++; } else { otherCount++; } } const totalRuns = sorted.length; const passRate = totalRuns > 0 ? successCount / totalRuns : 0; const recentRuns = sorted .slice(0, HISTORY_RECENT_RUNS_CAP) .map((f) => ({ id: f.id, state: f.state, runMode: f.runMode, startedAt: f.startedAt, completedAt: f.completedAt, durationMs: f.startedAt !== null && f.completedAt !== null ? f.completedAt - f.startedAt : null, })); let streakState = 'MIXED'; let streakLength = 0; if (recentRuns.length > 0) { const firstState = recentRuns[0].state; if (firstState === 'SUCCESS' || firstState === 'FAILED') { streakState = firstState; streakLength = 1; for (let i = 1; i < recentRuns.length; i++) { if (recentRuns[i].state === firstState) { streakLength++; } else { break; } } } } const lastSuccessfulRun = sorted.find((f) => f.state === 'SUCCESS'); return { flowName, totalRuns, successCount, failureCount, otherCount, passRate, recentRuns, currentStreak: { state: streakState, length: streakLength }, lastSuccessfulRunId: lastSuccessfulRun?.id ?? null, queryWindowDays: HISTORY_QUERY_WINDOW_DAYS, queriedAt: new Date().toISOString(), }; } /** * Derives actionable signals from historical flow run data to feed into the * heuristic classifier: flakiness, regression likelihood, prior self-heal * success, and whether the page.ai cache was recently validated. */ function deriveHistoricalSignals(history) { const { recentRuns, passRate } = history; // Flakiness: proportion of state transitions in recent runs let transitions = 0; for (let i = 1; i < recentRuns.length; i++) { if (recentRuns[i].state !== recentRuns[i - 1].state) { transitions++; } } const maxTransitions = Math.max(1, recentRuns.length - 1); const flakinessScore = recentRuns.length > 1 ? transitions / maxTransitions : 0; // Regression: consecutive failures at the head after passes existed let regressionLikelihood = 0; if (history.currentStreak.state === 'FAILED' && history.successCount > 0) { const streakWeight = Math.min(history.currentStreak.length / 5, 1); regressionLikelihood = Math.min(1, passRate * 0.6 + streakWeight * 0.4); } // Prior self-heal: a FAILED run immediately followed by SUCCESS (newest-first) let priorSelfHealSuccess = false; for (let i = 0; i < recentRuns.length - 1; i++) { if (recentRuns[i].state === 'SUCCESS' && recentRuns[i + 1].state === 'FAILED') { priorSelfHealSuccess = true; break; } } // Cache recently valid: any recent DETERMINISTIC SUCCESS const cacheWasRecentlyValid = recentRuns.some((r) => r.runMode === 'DETERMINISTIC' && r.state === 'SUCCESS'); return { flakinessScore, regressionLikelihood, recentPassRate: passRate, priorSelfHealSuccess, cacheWasRecentlyValid, }; } /** * Fetches historical runs of the same flow by name and returns a compact * summary. Runs concurrently with other evidence collection and fails open * so triage proceeds even if the history lookup encounters errors. */ async function fetchFlowHistory(page) { const flowName = page._dnb?.donobuFlowMetadata?.name; if (!flowName) { return null; } const flowsManager = page._dnb?.donobuStack?.flowsManager; if (!flowsManager) { return null; } const cutoffMs = Date.now() - HISTORY_QUERY_WINDOW_DAYS * 24 * 60 * 60 * 1000; const currentFlowId = page._dnb.donobuFlowMetadata.id; try { const result = await flowsManager.getFlows({ name: flowName, startedAfter: cutoffMs, limit: HISTORY_QUERY_LIMIT, }); const historicalFlows = result.items.filter((f) => f.id !== currentFlowId); if (historicalFlows.length === 0) { return null; } return summarizeFlowHistory(flowName, historicalFlows); } catch (error) { Logger_1.appLogger.warn(`Failed to fetch historical flow data for "${flowName}", proceeding without history.`, error); return null; } } /** * Retrieves the screenshot from the last completed tool call in the current flow. * Returns the raw PNG/JPEG buffer if available, or null. Fails open so triage * proceeds even if the screenshot cannot be loaded. */ async function fetchLastToolCallScreenshot(page) { const flowId = page._dnb?.donobuFlowMetadata?.id; const persistence = page._dnb?.persistence; if (!flowId || !persistence) { return null; } try { const toolCalls = await persistence.getToolCalls(flowId); if (toolCalls.length === 0) { return null; } // Walk backwards to find the last tool call with a screenshot for (let i = toolCalls.length - 1; i >= 0; i--) { const screenshotId = toolCalls[i].postCallImageId; if (screenshotId) { return await persistence.getScreenShot(flowId, screenshotId); } } return null; } catch (error) { Logger_1.appLogger.debug(`Failed to fetch last tool call screenshot for flow ${flowId}.`, error); return null; } } /** * Loads the final screenshot from a historical successful run to serve as a * visual baseline for comparison with the current failure state. This enables * the GPT triage agent to detect page redesigns and stale cache scenarios by * comparing "what the page looked like when it last worked" vs "what it looks * like now." Fails open — returns null if the screenshot cannot be retrieved. */ async function fetchBaselineScreenshot(page, historicalFlowId) { const persistence = page._dnb?.persistence; if (!persistence) { return null; } try { const toolCalls = await persistence.getToolCalls(historicalFlowId); if (toolCalls.length === 0) { return null; } for (let i = toolCalls.length - 1; i >= 0; i--) { const screenshotId = toolCalls[i].postCallImageId; if (screenshotId) { return await persistence.getScreenShot(historicalFlowId, screenshotId); } } return null; } catch (error) { Logger_1.appLogger.debug(`Failed to fetch baseline screenshot from historical flow ${historicalFlowId}.`, error); return null; } } /** * Loads the failing Playwright test file and extracts the statement block that * defines the target test case so the triage agent can corroborate expectations. */ async function extractTestCaseSnippet(testFilePath, testName) { if (!testFilePath) { return null; } try { const sourceCode = await fs.readFile(testFilePath, 'utf8'); const sourceFile = typescript_1.default.createSourceFile(testFilePath, sourceCode, typescript_1.default.ScriptTarget.Latest, true); let snippet = null; const visit = (node) => { if (snippet) { return; } if (typescript_1.default.isExpressionStatement(node) && typescript_1.default.isCallExpression(node.expression)) { const expression = node.expression.expression; if ((typescript_1.default.isIdentifier(expression) && (expression.text === 'test' || expression.text === 'it')) || (typescript_1.default.isPropertyAccessExpression(expression) && typescript_1.default.isIdentifier(expression.expression) && expression.expression.text === 'test')) { const args = node.expression.arguments; if (args.length > 0 && typescript_1.default.isStringLiteral(args[0])) { const title = args[0].text; if (title === testName || testName.includes(title) || title.includes(testName)) { snippet = sourceCode.substring(node.pos, node.end).trim(); return; } } } } typescript_1.default.forEachChild(node, visit); }; visit(sourceFile); if (!snippet) { return null; } return truncateString(snippet, MAX_SERIALIZED_STRING_LENGTH); } catch (error) { Logger_1.appLogger.warn(`Failed to extract test case snippet from ${testFilePath}`, error); return null; } } /** * Normalises the heterogeneous error structures Playwright can emit into * concise summaries that the LLM can rank and cross-reference with history. */ function buildErrorSummaries(testInfo) { const rawErrors = (testInfo.errors ?? []).length > 0 ? testInfo.errors : testInfo.error ? [testInfo.error] : []; return rawErrors.map((err) => { const summary = {}; if (typeof err?.message === 'string') { summary.message = truncateString(err.message, 2000); } if (typeof err?.stack === 'string') { summary.stack = truncateString(err.stack, 2000); } if (err?.name) { summary.name = String(err.name); } if (err?.value !== undefined) { summary.value = safeStringify(err.value, 2000); } if (err?.actual !== undefined) { summary.actual = safeStringify(err.actual, 1000); } if (err?.expected !== undefined) { summary.expected = safeStringify(err.expected, 1000); } if (err?.location) { summary.location = safeStringify(err.location, 500); } if (typeof err?.snippet === 'string') { summary.snippet = truncateString(err.snippet, 1000); } return summary; }); } /** * Translates an inferred failure reason into a sequenced set of remediation * actions so downstream automation and humans receive concrete next steps. */ function remediationStepsForReason(reason, context = {}) { switch (reason) { case 'AUTOMATION_SCRIPT_ISSUE': return [ { category: 'UPDATE_TEST_LOGIC', summary: 'Inspect the failing automation logic.', details: `Review the Playwright test and any Donobu tool invocations around the failure. Align the scripted steps with the intended business flow.`, }, { category: 'RETRY_AUTOMATION', summary: 'Retry after updating the automation.', details: 'Re-run the test or Donobu flow once the script adjustments are in place to validate the fix.', }, ]; case 'SELECTOR_REGRESSION': if (context.occurredDuringPageAi) { return [ { category: 'RETRY_AUTOMATION', summary: 'Delete the test cache and retry the test.', details: `Delete the cached ${cacheLocator_1.PAGE_AI_CACHE_DIRNAME}/<spec-file>${cacheLocator_1.PAGE_AI_CACHE_FILE_EXTENSION} entry for this test so page.ai recalculates selectors against the live DOM, then rerun the automation to verify recovery.`, }, { category: 'UPDATE_SELECTORS', summary: 'Update selectors if the autonomous retry still fails.', details: 'If cache invalidation and autonomous retry still fail, fall back to manually adjusting the selector strategy.', }, ]; } else { return [ { category: 'UPDATE_SELECTORS', summary: 'Refresh selectors for the affected elements.', details: 'Use page.find failovers or Playwright locators to update the targeting strategy for the broken element.', }, { category: 'RETRY_AUTOMATION', summary: 'Validate selectors by re-running the test.', details: 'Execute the test or self-healing run to confirm the new selectors resolve the regression.', }, ]; } case 'TIMING_OR_SYNCHRONISATION': return [ { category: 'ADJUST_TIMING', summary: 'Stabilise async waits and retry logic.', details: 'Add explicit waits, polling, or guard conditions so the automation aligns with the application response times.', }, { category: 'RETRY_AUTOMATION', summary: 'Run the test after timing adjustments.', details: 'Execute an automation retry to ensure the timing changes eliminate the flake.', }, ]; case 'ASSERTION_DRIFT': return [ { category: 'REFINE_ASSERTIONS', summary: 'Revisit expected outcomes and test assertions.', details: 'Cross-check the assertion expectations against the latest product behaviour and update the checks accordingly.', }, { category: 'RETRY_AUTOMATION', summary: 'Confirm updated assertions.', details: 'Execute the test again once assertions have been updated to verify alignment with the application.', }, ]; case 'APPLICATION_DEFECT': return [ { category: 'FIX_APPLICATION', summary: 'Log and prioritise the suspected product defect.', details: 'Capture reproduction steps using the failing automation and escalate to the owning development team.', }, { category: 'ESCALATE_MANUAL_REVIEW', summary: 'Coordinate QA verification of the fix.', details: 'Have QA validate the defect manually and confirm once the product change is deployed.', }, { category: 'RETRY_AUTOMATION', summary: 'Re-run automation after the product fix.', details: 'Execute the test to confirm the application change resolves the failure.', }, ]; case 'AUTHENTICATION_FAILURE': return [ { category: 'VALIDATE_AUTHENTICATION', summary: 'Verify credentials and auth flows.', details: 'Check login secrets, MFA configuration, and session state preconditions for the test environment.', }, { category: 'RETRY_AUTOMATION', summary: 'Run after auth prerequisites are restored.', details: 'Execute the test once authentication is confirmed to be working.', }, ]; case 'ENVIRONMENT_CONFIGURATION': return [ { category: 'CHECK_ENVIRONMENT', summary: 'Inspect environment and configuration.', details: 'Validate environment variables, feature flags, and infrastructure dependencies referenced by the test.', }, { category: 'RETRY_AUTOMATION', summary: 'Re-run once environment is stable.', details: 'Execute automation after configuration corrections to confirm stability.', }, ]; case 'TEST_DATA_UNAVAILABLE': return [ { category: 'REFRESH_TEST_DATA', summary: 'Restore or seed required test data.', details: 'Populate fixtures, reset accounts, or refresh records relied upon by the test.', }, { category: 'RETRY_AUTOMATION', summary: 'Run after data restoration.', details: 'Execute automation with the refreshed data to ensure the flow passes.', }, ]; case 'NETWORK_OR_DEPENDENCY': return [ { category: 'STABILIZE_DEPENDENCIES', summary: 'Check external services or network health.', details: 'Verify the availability and latency of downstream services, APIs, or network connections.', }, { category: 'RETRY_AUTOMATION', summary: 'Retry once dependencies recover.', details: 'Re-run the test when network conditions or dependency status return to normal.', }, ]; case 'UNKNOWN': default: return [ { category: 'ESCALATE_MANUAL_REVIEW', summary: 'Perform deeper manual triage.', details: 'Inspect Playwright traces, Donobu tool history, and application logs to narrow down the root cause.', }, { category: 'RETRY_AUTOMATION', summary: 'Retry once additional context is gathered.', details: 'After manual analysis, attempt another automation run to see if the issue reproduces consistently.', }, ]; } } /** * Specifies follow-up context the triage agent should ask for when evidence is * thin, keeping human responders focused on the data that unblocks a fix fastest. */ function additionalDataRequestsForReason(reason, _context = {}) { switch (reason) { case 'SELECTOR_REGRESSION': return [ { description: 'Collect DOM snapshots or screenshots around the failing selector.', suggestedSources: [ 'Playwright trace viewer', 'Donobu tool call screenshots', ], }, ]; case 'TIMING_OR_SYNCHRONISATION': return [ { description: 'Gather network and performance timings for the affected actions.', suggestedSources: [ 'Browser devtools performance logs', 'Backend request metrics', ], }, ]; case 'APPLICATION_DEFECT': return [ { description: 'Capture backend logs or Sentry events around the failure window.', suggestedSources: ['Application logging platform', 'APM traces'], }, ]; case 'AUTHENTICATION_FAILURE': return [ { description: 'Validate authentication tokens and secrets used by the test.', suggestedSources: ['Secret manager', 'Identity provider logs'], }, ]; case 'ENVIRONMENT_CONFIGURATION': return [ { description: 'Review environment variable values and feature flag states.', suggestedSources: ['Deployment configuration', 'Infra dashboards'], }, ]; case 'TEST_DATA_UNAVAILABLE': return [ { description: 'Check the lifecycle of the test accounts or fixtures.', suggestedSources: [ 'Test data management system', 'Database snapshots', ], }, ]; case 'NETWORK_OR_DEPENDENCY': return [ { description: 'Inspect dependency uptime and recent incidents.', suggestedSources: ['Status pages', 'Network monitoring dashboards'], }, ]; default: return [ { description: 'Review Playwright trace, Donobu flow metadata, and browser console logs.', suggestedSources: [ 'Playwright trace viewer', 'Donobu persistence layer', ], }, ]; } } /** * Applies lightweight heuristics across Playwright errors and Donobu tool logs * to produce a first-pass failure classification and supporting evidence trail. */ function inferFailureReason(errorSummaries, toolCalls) { const combinedText = [ ...errorSummaries.map((err) => err.message ?? ''), ...errorSummaries.map((err) => err.stack ?? ''), ...toolCalls.map((tc) => tc.outcomeSummary), ] .filter(Boolean) .join('\n') .toLowerCase(); const evidence = []; const matches = (pattern) => pattern.test(combinedText); // LocateException: AI-powered element location failed if (matches(/locateexception|failed to locate element/i)) { if (matches(/no.?matches/i)) { evidence.push('page.ai.locate() could not find any element matching the description.'); } else if (matches(/too.?many.?matches/i)) { evidence.push('page.ai.locate() matched too many elements and could not disambiguate.'); } else { evidence.push('page.ai.locate() failed to resolve an element.'); } return { reason: 'SELECTOR_REGRESSION', evidence, confidence: 0.7, }; } if (matches(/(selector|locator|element|node).*(not found|failed|undefined)/i)) { evidence.push('Automation reported a missing selector or locator.'); return { reason: 'SELECTOR_REGRESSION', evidence, confidence: 0.65, }; } if (matches(/timed out|timeout|wait.*exceeded|waiting for/i) || matches(/promise.*did not resolve/i)) { evidence.push('Timeout or waiting condition was detected in the failure.'); return { reason: 'TIMING_OR_SYNCHRONISATION', evidence, confidence: 0.6, }; } if (matches(/expect(ed)?|AssertionError|toEqual|toBe|received|expected/i) && !matches(/network|timeout/)) { evidence.push('Assertion mismatch detected in error details.'); return { reason: 'ASSERTION_DRIFT', evidence, confidence: 0.55, }; } if (matches(/401|403|unauthori[sz]ed|forbidden|login|credential|token/i)) { evidence.push('Authentication-related error message detected.'); return { reason: 'AUTHENTICATION_FAILURE', evidence, confidence: 0.6, }; } if (matches(/env(var|iron)|environment variable|configuration|config/i) || matches(/missing .*config|misconfig/i)) { evidence.push('Environment configuration issue referenced in failure text.'); return { reason: 'ENVIRONMENT_CONFIGURATION', evidence, confidence: 0.55, }; } if (matches(/test data|fixture|seed data|record not found|no data/i) || matches(/entity.*not found/)) { evidence.push('Missing or stale test data referenced.'); return { reason: 'TEST_DATA_UNAVAILABLE', evidence, confidence: 0.55, }; } if (matches(/ECONN|ENOTFOUND|EAI_AGAIN|network|socket hang up|connection/i) || matches(/502|503|504|gateway|dns/i)) { evidence.push('Network or dependency outage detected.'); return { reason: 'NETWORK_OR_DEPENDENCY', evidence, confidence: 0.6, }; } if (matches(/500|internal server error|TypeError|ReferenceError|Unhandled/i)) { evidence.push('Application-side error or exception detected.'); return { reason: 'APPLICATION_DEFECT', evidence, confidence: 0.6, }; } if (combinedText.trim().length > 0) { evidence.push('Falling back to automation script issue from generic error content.'); return { reason: 'AUTOMATION_SCRIPT_ISSUE', evidence, confidence: 0.4, }; } evidence.push('No diagnostic text available, marking as unknown.'); return { reason: 'UNKNOWN', evidence, confidence: 0.2, }; } const PAGE_AI_STACK_MARKERS = [ 'page.ai', 'pageairunner', 'pageaiexception', 'locateexception', 'locateelement', 'donobuflow', 'donobuextendedpage.ai', ]; const PAGE_AI_TOOL_MARKERS = new Set([ AnalyzePageTextTool_1.AnalyzePageTextTool.NAME, SummarizeLearningsTool_1.SummarizeLearningsTool.NAME, MarkObjectiveCompleteTool_1.MarkObjectiveCompleteTool.NAME, MarkObjectiveNotCompletableTool_1.MarkObjectiveNotCompletableTool.NAME, ]); /** * Detects whether the failure manifested during Donobu's autonomous page.ai * routines, signalling that cached selectors or AI-driven steps may need resets. */ function didFailureOccurDuringPageAi(errorSummaries, toolCalls) { const stackIndicator = errorSummaries.some((err) => { const blob = `${err.message ?? ''}\n${err.stack ?? ''}`.toLowerCase(); return PAGE_AI_STACK_MARKERS.some((marker) => blob.includes(marker)); }); if (stackIndicator) { return true; } return toolCalls.some((tc) => PAGE_AI_TOOL_MARKERS.has(tc.toolName)); } /** * Analyzes multiple signals to determine if the failure is likely caused by stale * page.ai instruction cache versus a legitimate test failure. This nuanced detection * helps differentiate between: * - Stale cache: cached actions succeeded but were semantically wrong (clicked wrong elements) * - Legitimate failure: cache was correct, but assertions reveal real issues * * The hardest case: page.ai uses stale cache, actions succeed (selectors still exist), * but the page was redesigned so we're interacting with wrong elements. This manifests * as successful page.ai execution followed by assertion failures about unexpected state. */ function analyzeStaleCacheIndicators(testInfo, errorSummaries, toolCalls, flowMetadata) { // Check if the flow ran in DETERMINISTIC mode (meaning cache was used) const usedDeterministicMode = flowMetadata?.runMode === 'DETERMINISTIC'; // Check if this is a retry attempt (cache would have been invalidated) const isRetryAttempt = testInfo.retry > 0; // Check if selector or locate issues occurred during page.ai tool execution const locateFailedInStack = errorSummaries.some((err) => { const blob = `${err.message ?? ''}\n${err.stack ?? ''}`.toLowerCase(); return /locateexception|failed to locate element/i.test(blob); }); const selectorFailedDuringPageAi = locateFailedInStack || (didFailureOccurDuringPageAi(errorSummaries, toolCalls) && toolCalls.some((tc) => { const isPageAiTool = PAGE_AI_TOOL_MARKERS.has(tc.toolName); const hasSelectorIssue = !tc.success && /(selector|locator|element|node).*(not found|failed|undefined)/i.test(tc.outcomeSummary); return isPageAiTool && hasSelectorIssue; })); // Check if tool calls show selector issues (more broadly) const toolCallsShowSelectorIssues = toolCalls.some((tc) => !tc.success && /(selector|locator|element|node).*(not found|failed|undefined|visible|attached)/i.test(tc.outcomeSummary)); // Check for quick failure pattern (DETERMINISTIC mode failures are typically fast) // When cache is stale, the first cached action often fails quickly const quickFailurePattern = usedDeterministicMode && testInfo.duration < 5000 && // Failed in less than 5 seconds toolCalls.length > 0 && toolCalls.length < 5; // Few tool calls before failure // Check if page.ai completed successfully but subsequent assertions failed const pageAiToolCalls = toolCalls.filter((tc) => PAGE_AI_TOOL_MARKERS.has(tc.toolName)); const hasPageAiCalls = pageAiToolCalls.length > 0; const allPageAiCallsSucceeded = hasPageAiCalls && pageAiToolCalls.every((tc) => tc.success); const hasPostPageAiFailure = errorSummaries.length > 0 && errorSummaries.some((err) => { const blob = `${err.message ?? ''}\n${err.stack ?? ''}`.toLowerCase(); return (/expect(ed)?|assertion/i.test(blob) && !PAGE_AI_STACK_MARKERS.some((marker) => blob.includes(marker))); }); const assertionsFailedAfterSuccessfulPageAi = allPageAiCallsSucceeded && hasPostPageAiFailure; // Check if failure occurred after page.ai completed (not during) const failedAfterPageAiCompleted = hasPageAiCalls && !didFailureOccurDuringPageAi(errorSummaries, toolCalls) && allPageAiCallsSucceeded; return { usedDeterministicMode, selectorFailedDuringPageAi, failedAfterPageAiCompleted, isRetryAttempt, quickFailurePattern, toolCallsShowSelectorIssues, assertionsFailedAfterSuccessfulPageAi, }; } /** * Maps each failure reason to downstream orchestration attributes and adapts * them when the failure happened during page.ai execution. */ function reasonAttributesFor(reason, context) { const { occurredDuringPageAi } = context; const base = { UNKNOWN: { shouldRetry: true, requiresCodeChange: false, requiresProductFix: false, }, AUTOMATION_SCRIPT_ISSUE: { shouldRetry: false, requiresCodeChange: true, requiresProductFix: false, }, SELECTOR_REGRESSION: { shouldRetr