donobu
Version:
Create browser automations with an LLM agent and replay them as Playwright scripts.
1,168 lines • 85.5 kB
JavaScript
"use strict";
// TypeScript-emitted interop helper: exposes `m[k]` on `o` under the name `k2`
// (`k2` defaults to `k`). A `__createBinding` already present on `this` is reused.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Keep the source descriptor only when it is an accessor on an `__esModule` object or a
// data property that is neither writable nor configurable; otherwise install a getter
// so later changes to `m[k]` stay visible through `o[k2]`.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback when `Object.create` is unavailable: a plain one-time copy of the value.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// TypeScript-emitted interop helper: stores `v` as the `default` property of `o`.
// Uses an enumerable property definition when `Object.create` exists, otherwise a
// plain assignment. A `__setModuleDefault` already present on `this` is reused.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
// TypeScript-emitted interop helper backing `import * as x from '...'`.
// Returns `mod` unchanged when it is flagged `__esModule`; otherwise builds a new
// object that re-binds every own key except "default" and sets `default` to `mod`.
// A `__importStar` already present on `this` is reused.
var __importStar = (this && this.__importStar) || (function () {
// `ownKeys` replaces itself on first call with `Object.getOwnPropertyNames`, or with a
// `for...in` + `hasOwnProperty` fallback when that built-in is missing.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// `mod != null` deliberately covers both null and undefined.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
// TypeScript-emitted interop helper backing default imports: returns `mod` as-is when
// it is flagged `__esModule`, otherwise wraps it as `{ default: mod }`.
// A `__importDefault` already present on `this` is reused.
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.TRIAGE_PERSISTENCE_FILE_IDS = exports.TreatmentPlan = exports.RemediationStepSchema = exports.FailureReasonSchema = exports.AdditionalDataRequestSchema = void 0;
exports.deriveHeuristicAssessment = deriveHeuristicAssessment;
exports.deriveHistoricalSignals = deriveHistoricalSignals;
exports.gatherTestFailureEvidence = gatherTestFailureEvidence;
exports.generateTreatmentPlanFromEvidence = generateTreatmentPlanFromEvidence;
exports.reconcileTreatmentPlan = reconcileTreatmentPlan;
exports.summarizeFlowHistory = summarizeFlowHistory;
const crypto_1 = require("crypto");
const fs = __importStar(require("fs/promises"));
const path = __importStar(require("path"));
const typescript_1 = __importDefault(require("typescript"));
const v4_1 = require("zod/v4");
const AnalyzePageTextTool_1 = require("../../../tools/AnalyzePageTextTool");
const MarkObjectiveCompleteTool_1 = require("../../../tools/MarkObjectiveCompleteTool");
const MarkObjectiveNotCompletableTool_1 = require("../../../tools/MarkObjectiveNotCompletableTool");
const SummarizeLearningsTool_1 = require("../../../tools/SummarizeLearningsTool");
const Logger_1 = require("../../../utils/Logger");
const cacheLocator_1 = require("../../ai/cache/cacheLocator");
/**
* # Test Failure Triage System
*
* Transforms Playwright test failures in Donobu-powered test suites into structured,
* actionable **treatment plans**. A treatment plan tells both humans and automation *why*
* the test failed, *how confident* the system is in that diagnosis, and *what to do next*
* — whether that is retrying the automation, deleting a stale page.ai cache, filing a
* product bug, or updating selectors in the test code.
*
* ---
*
* ## Architecture Overview
*
* The system operates in two phases that run in sequence:
*
* ### Phase 1 — Evidence Collection (`gatherTestFailureEvidence`)
*
* Called automatically by the Donobu test extension (`testExtension.ts`) in the
* Playwright `afterEach` hook whenever a test fails. This phase:
*
* 1. Extracts error messages, stack traces, and assertion details from `TestInfo`.
* 2. Loads the Donobu flow metadata (objective, run mode, state) and recent tool call
* history from the persistence layer.
* 3. Fetches **historical runs** of the same flow (by name) from the flows manager to
* detect flakiness, regression patterns, and prior self-heal success.
* 4. Captures the **failure screenshot** (last tool call screenshot from the current
* run) and the **baseline screenshot** (last tool call screenshot from the most
* recent successful historical run) for visual comparison.
* 5. Reads the source of the failing test case for contextual grounding.
* 6. Runs the **heuristic classifier** (`deriveHeuristicAssessment`) which uses
* rule-based pattern matching over errors, tool calls, stale-cache indicators,
* and historical signals to produce a preliminary diagnosis — including a failure
* reason, confidence score, and retry recommendation.
* 7. Persists the complete evidence bundle (JSON + screenshots) to disk as a
* `FailureEvidenceRecord`.
*
* ### Phase 2 — Treatment Plan Generation (`generateTreatmentPlanFromEvidence`)
*
* Called by the Donobu CLI (`donobu-cli.ts`) after evidence files are collected. This
* phase reads the persisted evidence and **requires a GPT client** — there is no
* heuristic-only fallback path. It:
*
* 1. Sends the full evidence bundle — including screenshots as vision input — to a
* GPT model with a detailed system prompt, requesting a structured `TreatmentPlan`
* response.
* 2. **Reconciles** the GPT plan with heuristic signals (`reconcileTreatmentPlan`) to
* enforce invariants the LLM might miss (e.g., forcing `shouldRetryAutomation` when
* historical data shows prior self-heal success, or overriding retry-step priority
* for stale-cache scenarios).
* 3. Returns the final `TreatmentPlan` for the CLI to act on — potentially triggering
* automatic retries, cache deletion, or surfacing remediation steps to the engineer.
*
* ---
*
* ## Data Signals
*
* The triage system draws from several complementary data sources, each targeting
* different failure modes:
*
* | Signal | Source | What it reveals |
* |-------------------------|-------------------------------|----------------------------------------------------|
* | Error messages & stacks | `TestInfo.errors` | Direct cause (assertion, timeout, selector) |
* | Tool call history | `FlowsPersistence` | What actions the AI took and their outcomes |
* | Tool call parameters | `ToolCall.parameters` | Exact selectors, URLs, and inputs attempted |
* | Flow metadata | `DonobuExtendedPage._dnb` | Run mode, objective, allowed tools, timing |
* | Stale cache indicators | Derived from above | Whether page.ai cache staleness is the root cause |
* | Historical flow runs | `DonobuFlowsManager.getFlows` | Flakiness, regression patterns, prior self-heal |
* | Failure screenshot | Last tool call screenshot | Visual state of the page when the failure occurred |
* | Baseline screenshot | Last successful run's screenshot | Visual reference for what the page *should* look like |
* | Test source snippet | TypeScript AST parsing | The test's expectations and structure |
*
* ---
*
* ## Failure Classification
*
* Every treatment plan assigns one of the following failure reasons:
*
* - `SELECTOR_REGRESSION` — UI locators have gone stale.
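* - `STALE_CACHE_OR_INSTRUCTIONS` — The page.ai deterministic cache is outdated.
*   (NOTE(review): this value is not a member of the `FailureReasonSchema` enum defined
*   below — confirm how stale-cache failures are classified in a treatment plan.)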
* - `TIMING_OR_SYNCHRONISATION` — Race conditions, slow loads, or flaky waits.
* - `NETWORK_OR_DEPENDENCY` — External service failures or connectivity issues.
* - `APPLICATION_DEFECT` — A real bug in the product under test.
* - `ASSERTION_DRIFT` — Test expectations no longer match valid application behavior.
* - `AUTOMATION_SCRIPT_ISSUE` — The test script itself is incorrect.
* - `AUTHENTICATION_FAILURE` — Session/auth problems prevented the test from running.
* - `ENVIRONMENT_CONFIGURATION` — Infrastructure or environment misconfiguration.
* - `TEST_DATA_UNAVAILABLE` — Required test data is missing or invalid.
* - `UNKNOWN` — Insufficient signal to determine the cause.
*
* ---
*
* ## Getting the Most Out of This System
*
* ### 1. Name your flows consistently
*
* Historical analysis works by matching flows by name. If every test uses a unique,
* stable flow name, the system can compare the current failure against all prior runs
* of the same flow and detect flakiness, regressions, and self-heal patterns:
*
* ```ts
* test('checkout flow adds item and completes purchase', async ({ page }) => {
* const ai = await page.ai('Checkout — add item and purchase', { ... });
* // ...
* });
* ```
*
* ### 2. Let evidence persist to disk
*
* The default behavior writes evidence JSON and screenshots to the run directory. This
* enables the CLI's Phase 2 to enrich the diagnosis with GPT and visual comparison.
* Ensure `DONOBU_TRIAGE_DISABLED` is not set, and that `runDirectory` is writable:
*
* ```ts
* // Evidence is gathered automatically on failure — no extra code needed.
* // To customize the output directory:
* await gatherTestFailureEvidence(testInfo, page, {
* runDirectory: '/path/to/custom/output',
* });
* ```
*
* ### 3. Ensure a GPT client is available
*
* A GPT client is **required** for treatment plan generation. The LLM performs semantic
* reasoning: it reads the test source, interprets tool call parameters, compares
* screenshots visually, and produces human-readable remediation steps. The CLI
* instantiates a GPT client automatically from configured credentials.
*
* ### 4. Use deterministic (cached) mode for stable flows
*
* When flows run in `DETERMINISTIC` mode with a page.ai cache, the triage system
* activates its stale-cache detection pipeline — a composite scoring system that
* weighs whether the cached instructions have gone stale versus whether the failure
* is a legitimate test issue. This is the system's strongest diagnostic capability.
*
* ### 5. Inspect the evidence files
*
* Each failure produces a `failure-evidence-<id>.json` file (plus optional PNG
* screenshots) in the run directory. These files are self-contained and can be
* re-processed, shared for debugging, or fed back into `generateTreatmentPlanFromEvidence`
* independently.
*
* ---
*
* ## Key Exports
*
* - `gatherTestFailureEvidence` — Phase 1 entry point. Call from a Playwright afterEach hook.
* - `generateTreatmentPlanFromEvidence` — Phase 2 entry point. Requires a `GptClient` and a
* `FailureEvidenceRecord`.
* - `TreatmentPlan` — The Zod schema defining the treatment plan structure.
* - `FailureReasonSchema` — The Zod enum of all possible failure classifications.
*/
// Zod enum of the failure classifications a treatment plan may carry (used as the
// `failureReason` field of `TreatmentPlan`). The `.describe(...)` text is attached to the
// schema itself, so it is runtime data rather than a comment; keep it in sync with the
// enum members when adding or removing a reason.
const FailureReasonSchema = v4_1.z
.enum([
'UNKNOWN',
'AUTOMATION_SCRIPT_ISSUE',
'SELECTOR_REGRESSION',
'TIMING_OR_SYNCHRONISATION',
'ASSERTION_DRIFT',
'APPLICATION_DEFECT',
'AUTHENTICATION_FAILURE',
'ENVIRONMENT_CONFIGURATION',
'TEST_DATA_UNAVAILABLE',
'NETWORK_OR_DEPENDENCY',
])
.describe(`UNKNOWN: Triggered when no concrete root cause can be inferred.
AUTOMATION_SCRIPT_ISSUE: The scripted steps are incorrect or incomplete.
SELECTOR_REGRESSION: UI element locators have changed and need updates.
TIMING_OR_SYNCHRONISATION: Wait conditions or timing assumptions failed.
ASSERTION_DRIFT: The expected outcomes in assertions no longer match reality.
APPLICATION_DEFECT: The product behaviour is broken and must be fixed upstream.
AUTHENTICATION_FAILURE: Login, MFA, or session preconditions were not met.
ENVIRONMENT_CONFIGURATION: Test infra or env vars are misconfigured.
TEST_DATA_UNAVAILABLE: Seed data or fixtures are missing or expired.
NETWORK_OR_DEPENDENCY: External services or network connectivity failed.`);
exports.FailureReasonSchema = FailureReasonSchema;
// Zod enum of remediation categories; used as the `category` field of
// `RemediationStepSchema`. Not exported from this module.
const RemediationCategorySchema = v4_1.z
.enum([
'RETRY_AUTOMATION',
'UPDATE_TEST_LOGIC',
'UPDATE_SELECTORS',
'ADJUST_TIMING',
'REFINE_ASSERTIONS',
'FIX_APPLICATION',
'VALIDATE_AUTHENTICATION',
'CHECK_ENVIRONMENT',
'REFRESH_TEST_DATA',
'STABILIZE_DEPENDENCIES',
'ESCALATE_MANUAL_REVIEW',
'UNKNOWN',
])
.describe('Categorises the type of remediation that should be attempted so that downstream systems can pick appropriate playbooks.');
// Zod object schema for a single remediation step: a category plus a short `summary`
// label and longer `details` guidance. All three fields are required.
const RemediationStepSchema = v4_1.z.object({
category: RemediationCategorySchema,
summary: v4_1.z
.string()
.describe('Short actionable label describing what must happen next.'),
details: v4_1.z
.string()
.describe('Specific guidance for performing the remediation step.'),
});
exports.RemediationStepSchema = RemediationStepSchema;
// Zod object schema for a request for extra diagnostic context. `description` is
// required; `suggestedSources` defaults to an empty array when omitted.
const AdditionalDataRequestSchema = v4_1.z.object({
description: v4_1.z
.string()
.describe('Information that would materially help confirm the root cause.'),
suggestedSources: v4_1.z
.array(v4_1.z.string())
.describe('Where to look for the requested data.')
.default([]),
});
exports.AdditionalDataRequestSchema = AdditionalDataRequestSchema;
// Zod object schema for optional hints attached to a treatment plan (cache clearing,
// which spec/project to re-run, extra Playwright CLI arguments). Not exported.
// Every field is individually `.optional()` and the object is additionally wrapped in
// `.partial()`, so no key is required.
// NOTE(review): the two layers of optionality look redundant — confirm before simplifying,
// since it may affect the generated schema shape.
const AutomationDirectivesSchema = v4_1.z
.object({
clearPageAiCache: v4_1.z
.boolean()
.describe('When true, clear cached Page.AI selectors before attempting an automated retry.')
.optional(),
targetTestFile: v4_1.z
.string()
.describe('Relative path to the Playwright spec that should be re-run when applying this plan.')
.optional(),
targetProject: v4_1.z
.string()
.describe('Playwright project name that should be used when re-running automation for this failure.')
.optional(),
additionalPlaywrightArgs: v4_1.z
.array(v4_1.z.string())
.describe('Extra Playwright CLI arguments the orchestrator should append when applying this plan.')
.optional(),
})
.partial();
// Zod object schema for the final triage result.
// Required: `failureSummary`, `failureReason`, `confidence` (a number constrained to
// [0, 1]), and the three booleans `shouldRetryAutomation`, `requiresCodeChange`,
// `requiresProductFix`.
// Defaulted to []: `observedIndicators`, `remediationSteps`, `additionalDataRequests`.
// Optional: `notes`, `automationDirectives`.
const TreatmentPlan = v4_1.z.object({
failureSummary: v4_1.z
.string()
.describe('A summary of the nature of the test failure'),
failureReason: FailureReasonSchema,
confidence: v4_1.z
.number()
.min(0)
.max(1)
.describe('Confidence score between 0 and 1 estimating how likely the failureReason classification is correct.'),
observedIndicators: v4_1.z
.array(v4_1.z.string())
.describe('Signals and observations that led to the triage conclusion.')
.default([]),
remediationSteps: v4_1.z
.array(RemediationStepSchema)
.describe('Ordered remediation steps to attempt.')
.default([]),
additionalDataRequests: v4_1.z
.array(AdditionalDataRequestSchema)
.describe('Extra context that would help if remediation stalls.')
.default([]),
shouldRetryAutomation: v4_1.z
.boolean()
.describe('True if the automation framework should attempt another run.'),
requiresCodeChange: v4_1.z
.boolean()
.describe('True when the Playwright test code likely needs updates.'),
requiresProductFix: v4_1.z
.boolean()
.describe('True when an application-level defect is suspected.'),
notes: v4_1.z
.string()
.describe('Optional free-form notes that do not fit the structured fields.')
.optional(),
automationDirectives: AutomationDirectivesSchema.optional(),
});
exports.TreatmentPlan = TreatmentPlan;
// Character cap applied to the extracted test-case snippet (see extractTestCaseSnippet).
const MAX_SERIALIZED_STRING_LENGTH = 10000;
// Character cap for the serialized `parameters` of each summarized tool call.
const MAX_TOOL_CALL_PARAMETERS_LENGTH = 500;
// Number of most recent completed tool calls kept by summarizeToolCalls.
const MAX_TOOL_CALLS_TO_INCLUDE = 12;
// NOTE(review): not referenced in the portion of the file reviewed here; presumably the
// version stamped onto persisted evidence records — confirm against the writer.
const TRIAGE_EVIDENCE_SCHEMA_VERSION = 2;
// Exported identifiers for the evidence JSON and the two screenshots.
// NOTE(review): how these IDs map to on-disk names is not visible here — confirm.
const TRIAGE_PERSISTENCE_FILE_IDS = {
evidence: 'triage-evidence.json',
failureScreenshot: 'triage-failure-screenshot.png',
baselineScreenshot: 'triage-baseline-screenshot.png',
};
exports.TRIAGE_PERSISTENCE_FILE_IDS = TRIAGE_PERSISTENCE_FILE_IDS;
/**
 * Caps `value` at `maxLength` characters.
 *
 * Strings that already fit are returned untouched. Longer strings are cut and end with
 * a three-dot ellipsis that counts towards the limit; when the limit is too small to
 * hold anything besides the ellipsis (3 or fewer), the string is cut with no marker.
 *
 * @param {string} value - Text to bound.
 * @param {number} maxLength - Maximum length of the returned string.
 * @returns {string} The original or shortened text.
 */
function truncateString(value, maxLength) {
    const ellipsis = '...';
    if (value.length <= maxLength) {
        return value;
    }
    const hasRoomForEllipsis = maxLength > ellipsis.length;
    return hasRoomForEllipsis
        ? value.slice(0, maxLength - ellipsis.length) + ellipsis
        : value.slice(0, maxLength);
}
/**
 * Serialises an arbitrary value to pretty-printed JSON (2-space indent) while bounding
 * both every nested string and the overall output length.
 *
 * Never throws: serialisation failures (cycles, BigInt, throwing `toJSON`, ...) are
 * reported as a `Failed to stringify: ...` string instead.
 *
 * @param {*} value - Value to serialise.
 * @param {number} maxLength - Maximum length of the returned string.
 * @param {number} [stringMaxLength=500] - Maximum length of each nested string value.
 * @returns {string} Bounded textual representation of `value`.
 */
function safeStringify(value, maxLength, stringMaxLength = 500) {
    try {
        const serialized = JSON.stringify(value, (_key, innerValue) => typeof innerValue === 'string'
            ? truncateString(innerValue, stringMaxLength)
            : innerValue, 2);
        if (serialized === undefined) {
            // JSON.stringify yields `undefined` (not a string) for `undefined`, functions
            // and symbols. Fall back to String() so callers get a meaningful label rather
            // than a TypeError message from truncating a non-string.
            return truncateString(String(value), maxLength);
        }
        return truncateString(serialized, maxLength);
    }
    catch (error) {
        // Anything can be thrown (e.g. by a custom toJSON), so only read name/message
        // from genuine Error instances.
        const reason = error instanceof Error
            ? `${error.name}: ${error.message}`
            : String(error);
        return `Failed to stringify: ${reason}`;
    }
}
/**
 * Projects persisted flow metadata onto the fixed subset of fields handed to the triage
 * agent; any property not listed here is dropped.
 *
 * `targetWebsite` is read from `metadata.web` and falls back to an empty string, and
 * `result` is flattened to a JSON string (`null` when absent or falsy). `envVars` is
 * passed through as-is.
 *
 * @param {object|null|undefined} metadata - Flow metadata, if any.
 * @returns {object|null} The reduced metadata, or `null` when none was supplied.
 */
function sanitizeFlowMetadata(metadata) {
    if (!metadata) {
        return null;
    }
    const { id, name, runMode, state, web, overallObjective, allowedTools, envVars, startedAt, completedAt, maxToolCalls, gptConfigName, defaultMessageDuration, result, } = metadata;
    const resultSummary = result ? JSON.stringify(result) : null;
    return {
        id,
        name,
        runMode,
        state,
        targetWebsite: web?.targetWebsite ?? '',
        overallObjective,
        allowedTools,
        envVars,
        startedAt,
        completedAt,
        maxToolCalls,
        gptConfigName,
        defaultMessageDuration,
        resultSummary,
    };
}
/**
 * Reduces the tool-call history to a compact list for the triage prompt.
 *
 * Calls whose `outcome` or `completedAt` is `null` are skipped, only the trailing
 * `MAX_TOOL_CALLS_TO_INCLUDE` remaining calls are kept, and each is mapped to a summary
 * with its outcome, duration, ISO timestamps and length-bounded serialized parameters.
 *
 * @param {Array<object>} toolCalls - Tool calls in the order they were recorded.
 * @returns {Array<object>} Summaries of the most recent completed tool calls.
 */
function summarizeToolCalls(toolCalls) {
    const finishedCalls = toolCalls.filter(({ outcome, completedAt }) => outcome !== null && completedAt !== null);
    const latestCalls = finishedCalls.slice(-MAX_TOOL_CALLS_TO_INCLUDE);
    return latestCalls.map((call) => {
        const { id, toolName, outcome, page, parameters, startedAt, completedAt } = call;
        return {
            id,
            toolName,
            success: outcome.isSuccessful,
            outcomeSummary: outcome.forLlm,
            durationMs: completedAt - startedAt,
            page,
            startedAtIso: new Date(startedAt).toISOString(),
            completedAtIso: new Date(completedAt).toISOString(),
            parameters: safeStringify(parameters, MAX_TOOL_CALL_PARAMETERS_LENGTH, 200),
        };
    });
}
// How far back (in days) historical runs are queried, and the value reported as
// `queryWindowDays` in every history summary.
const HISTORY_QUERY_WINDOW_DAYS = 14;
// Maximum number of historical runs requested from the flows manager.
const HISTORY_QUERY_LIMIT = 20;
// Maximum number of newest runs listed individually in a history summary.
const HISTORY_RECENT_RUNS_CAP = 10;
/**
 * Compresses a set of historical flow runs into an aggregate summary.
 *
 * Runs are ordered newest-first by `startedAt` (missing timestamps sort as 0). Counts
 * and `passRate` cover every supplied run; `recentRuns` and `currentStreak` cover only
 * the newest `HISTORY_RECENT_RUNS_CAP` runs. The streak is the number of consecutive
 * runs, starting from the newest, that share its state; it is reported as `MIXED` with
 * length 0 when the newest run is neither `SUCCESS` nor `FAILED`.
 *
 * @param {string} flowName - Name shared by the runs being summarised.
 * @param {Array<object>} flows - Historical runs (`id`, `state`, `runMode`, `startedAt`,
 *   `completedAt`); the input array is not mutated.
 * @returns {object} Aggregate history summary, stamped with the current time.
 */
function summarizeFlowHistory(flowName, flows) {
    const sorted = [...flows].sort((a, b) => (b.startedAt ?? 0) - (a.startedAt ?? 0));
    let successCount = 0;
    let failureCount = 0;
    let otherCount = 0;
    for (const flow of sorted) {
        if (flow.state === 'SUCCESS') {
            successCount++;
        }
        else if (flow.state === 'FAILED') {
            failureCount++;
        }
        else {
            otherCount++;
        }
    }
    const totalRuns = sorted.length;
    const passRate = totalRuns > 0 ? successCount / totalRuns : 0;
    const recentRuns = sorted
        .slice(0, HISTORY_RECENT_RUNS_CAP)
        .map((f) => ({
        id: f.id,
        state: f.state,
        runMode: f.runMode,
        startedAt: f.startedAt,
        completedAt: f.completedAt,
        // `!= null` rejects both null and undefined, matching the nullish handling used
        // for sorting; a missing timestamp must give null, never NaN.
        durationMs: f.startedAt != null && f.completedAt != null
            ? f.completedAt - f.startedAt
            : null,
    }));
    let streakState = 'MIXED';
    let streakLength = 0;
    if (recentRuns.length > 0) {
        const firstState = recentRuns[0].state;
        if (firstState === 'SUCCESS' || firstState === 'FAILED') {
            streakState = firstState;
            streakLength = 1;
            for (let i = 1; i < recentRuns.length; i++) {
                if (recentRuns[i].state === firstState) {
                    streakLength++;
                }
                else {
                    break;
                }
            }
        }
    }
    const lastSuccessfulRun = sorted.find((f) => f.state === 'SUCCESS');
    return {
        flowName,
        totalRuns,
        successCount,
        failureCount,
        otherCount,
        passRate,
        recentRuns,
        currentStreak: { state: streakState, length: streakLength },
        lastSuccessfulRunId: lastSuccessfulRun?.id ?? null,
        queryWindowDays: HISTORY_QUERY_WINDOW_DAYS,
        queriedAt: new Date().toISOString(),
    };
}
/**
 * Turns a flow-history summary into the numeric and boolean signals used by the
 * heuristic classifier. `history.recentRuns` is expected newest-first.
 *
 * - `flakinessScore`: share of adjacent run pairs whose states differ (0 with < 2 runs).
 * - `regressionLikelihood`: non-zero only while the current streak is `FAILED` and at
 *   least one success exists; blends pass rate (60%) with streak length capped at 5 (40%).
 * - `priorSelfHealSuccess`: some run succeeded directly after a failed one.
 * - `cacheWasRecentlyValid`: some recent run was a `DETERMINISTIC` success.
 *
 * @param {object} history - Summary as produced by `summarizeFlowHistory`.
 * @returns {object} The derived signals, plus `recentPassRate` copied from `passRate`.
 */
function deriveHistoricalSignals(history) {
    const { recentRuns, passRate, currentStreak, successCount } = history;
    const states = recentRuns.map((run) => run.state);
    // Compare every run with its immediately newer neighbour.
    const flips = states
        .slice(1)
        .filter((state, olderIndex) => state !== states[olderIndex]).length;
    const flakinessScore = states.length > 1 ? flips / (states.length - 1) : 0;
    const failingAfterPasses = currentStreak.state === 'FAILED' && successCount > 0;
    const regressionLikelihood = failingAfterPasses
        ? Math.min(1, passRate * 0.6 + Math.min(currentStreak.length / 5, 1) * 0.4)
        : 0;
    // Newest-first ordering: a SUCCESS whose next (older) entry is FAILED means the flow
    // recovered right after a failure.
    const priorSelfHealSuccess = states.some((state, index) => state === 'SUCCESS' && states[index + 1] === 'FAILED');
    const cacheWasRecentlyValid = recentRuns.some(({ runMode, state }) => state === 'SUCCESS' && runMode === 'DETERMINISTIC');
    return {
        flakinessScore,
        regressionLikelihood,
        recentPassRate: passRate,
        priorSelfHealSuccess,
        cacheWasRecentlyValid,
    };
}
/**
 * Looks up earlier runs of the current flow (matched by name, within the last
 * `HISTORY_QUERY_WINDOW_DAYS` days, at most `HISTORY_QUERY_LIMIT` results) and returns
 * their summary.
 *
 * Fails open: resolves to `null` when the page carries no flow name or flows manager,
 * when no run other than the current one is found, or when the lookup throws (the
 * error is logged as a warning).
 *
 * @param {object} page - Donobu-extended Playwright page (reads `page._dnb`).
 * @returns {Promise<object|null>} History summary, or `null` when unavailable.
 */
async function fetchFlowHistory(page) {
    const dnb = page._dnb;
    const flowName = dnb?.donobuFlowMetadata?.name;
    if (!flowName) {
        return null;
    }
    const flowsManager = dnb?.donobuStack?.flowsManager;
    if (!flowsManager) {
        return null;
    }
    const windowMs = HISTORY_QUERY_WINDOW_DAYS * 24 * 60 * 60 * 1000;
    const query = {
        name: flowName,
        startedAfter: Date.now() - windowMs,
        limit: HISTORY_QUERY_LIMIT,
    };
    const currentFlowId = dnb.donobuFlowMetadata.id;
    try {
        const { items } = await flowsManager.getFlows(query);
        // The run being triaged is not part of its own history.
        const priorRuns = items.filter(({ id }) => id !== currentFlowId);
        return priorRuns.length === 0
            ? null
            : summarizeFlowHistory(flowName, priorRuns);
    }
    catch (error) {
        Logger_1.appLogger.warn(`Failed to fetch historical flow data for "${flowName}", proceeding without history.`, error);
        return null;
    }
}
/**
 * Finds the newest tool call of the current flow that recorded a post-call image and
 * loads that screenshot through the persistence layer.
 *
 * Fails open: resolves to `null` when the page has no flow id or persistence handle,
 * when no tool call carries an image id, or when loading throws (logged at debug level).
 *
 * @param {object} page - Donobu-extended Playwright page (reads `page._dnb`).
 * @returns {Promise<*>} Whatever `persistence.getScreenShot` resolves to, or `null`.
 */
async function fetchLastToolCallScreenshot(page) {
    const dnb = page._dnb;
    const flowId = dnb?.donobuFlowMetadata?.id;
    const persistence = dnb?.persistence;
    if (!(flowId && persistence)) {
        return null;
    }
    try {
        const toolCalls = await persistence.getToolCalls(flowId);
        // Scan from the newest call towards the oldest.
        let index = toolCalls.length;
        while (index-- > 0) {
            const { postCallImageId } = toolCalls[index];
            if (postCallImageId) {
                return await persistence.getScreenShot(flowId, postCallImageId);
            }
        }
        return null;
    }
    catch (error) {
        Logger_1.appLogger.debug(`Failed to fetch last tool call screenshot for flow ${flowId}.`, error);
        return null;
    }
}
/**
 * Loads the newest screenshot recorded by a historical run so it can serve as the
 * visual baseline ("what the page looked like when it last worked") next to the
 * failure screenshot of the current run.
 *
 * Fails open: resolves to `null` when the page has no persistence handle, when the
 * historical run has no tool call with an image id, or when loading throws (logged at
 * debug level).
 *
 * @param {object} page - Donobu-extended Playwright page (reads `page._dnb.persistence`).
 * @param {string} historicalFlowId - Id of the earlier run to take the baseline from.
 * @returns {Promise<*>} Whatever `persistence.getScreenShot` resolves to, or `null`.
 */
async function fetchBaselineScreenshot(page, historicalFlowId) {
    const persistence = page._dnb?.persistence;
    if (!persistence) {
        return null;
    }
    try {
        const toolCalls = await persistence.getToolCalls(historicalFlowId);
        // Scan from the newest call towards the oldest.
        let index = toolCalls.length;
        while (index-- > 0) {
            const { postCallImageId } = toolCalls[index];
            if (postCallImageId) {
                return await persistence.getScreenShot(historicalFlowId, postCallImageId);
            }
        }
        return null;
    }
    catch (error) {
        Logger_1.appLogger.debug(`Failed to fetch baseline screenshot from historical flow ${historicalFlowId}.`, error);
        return null;
    }
}
/**
 * Reads the failing Playwright spec and returns the source text of the statement that
 * declares the target test, so the triage agent can see the test's expectations.
 *
 * Candidate statements are calls to `test(...)`, `it(...)` or `test.<member>(...)` whose
 * first argument is a string literal or a backtick literal without substitutions. An
 * exact title match always wins; only when none exists is the first loose match (either
 * title contains the other) used. This stops a short title such as "login" from
 * shadowing "login with SSO", and lets the search continue inside a loosely matching
 * wrapper such as `test.describe`.
 *
 * Fails open: resolves to `null` when no path is given, nothing matches, or the file
 * cannot be read or parsed (logged as a warning).
 *
 * @param {string|null|undefined} testFilePath - Path to the spec file.
 * @param {string} testName - Title of the failing test.
 * @returns {Promise<string|null>} Snippet capped at `MAX_SERIALIZED_STRING_LENGTH`, or `null`.
 */
async function extractTestCaseSnippet(testFilePath, testName) {
    if (!testFilePath) {
        return null;
    }
    const ts = typescript_1.default;
    try {
        const sourceCode = await fs.readFile(testFilePath, 'utf8');
        const sourceFile = ts.createSourceFile(testFilePath, sourceCode, ts.ScriptTarget.Latest, true);
        const isTestCallee = (callee) => (ts.isIdentifier(callee) &&
            (callee.text === 'test' || callee.text === 'it')) ||
            (ts.isPropertyAccessExpression(callee) &&
                ts.isIdentifier(callee.expression) &&
                callee.expression.text === 'test');
        const isPlainTitle = (arg) => ts.isStringLiteral(arg) || ts.isNoSubstitutionTemplateLiteral(arg);
        let exactMatch = null;
        let looseMatch = null;
        const visit = (node) => {
            if (exactMatch !== null) {
                return;
            }
            if (ts.isExpressionStatement(node) &&
                ts.isCallExpression(node.expression) &&
                isTestCallee(node.expression.expression)) {
                const args = node.expression.arguments;
                if (args.length > 0 && isPlainTitle(args[0])) {
                    const title = args[0].text;
                    if (title === testName) {
                        exactMatch = sourceCode.substring(node.pos, node.end).trim();
                        return;
                    }
                    if (looseMatch === null &&
                        (testName.includes(title) || title.includes(testName))) {
                        // Remember the first loose candidate but keep walking: an exact
                        // match may still appear later or nested inside this statement.
                        looseMatch = sourceCode.substring(node.pos, node.end).trim();
                    }
                }
            }
            ts.forEachChild(node, visit);
        };
        visit(sourceFile);
        const snippet = exactMatch ?? looseMatch;
        if (!snippet) {
            return null;
        }
        return truncateString(snippet, MAX_SERIALIZED_STRING_LENGTH);
    }
    catch (error) {
        Logger_1.appLogger.warn(`Failed to extract test case snippet from ${testFilePath}`, error);
        return null;
    }
}
/**
 * Flattens the errors reported on a Playwright `TestInfo` into compact summaries.
 *
 * Uses `testInfo.errors` when it is non-empty, otherwise the single `testInfo.error`
 * if present. Each summary only contains the fields found on the error: string fields
 * (`message`, `stack`, `snippet`) are truncated, structured fields (`value`, `actual`,
 * `expected`, `location`) are serialized with a length cap, and `name` is stringified.
 *
 * @param {object} testInfo - Playwright test info of the failed test.
 * @returns {Array<object>} One summary per reported error (possibly empty).
 */
function buildErrorSummaries(testInfo) {
    let rawErrors = [];
    if ((testInfo.errors ?? []).length > 0) {
        rawErrors = testInfo.errors;
    }
    else if (testInfo.error) {
        rawErrors = [testInfo.error];
    }
    return rawErrors.map((err) => {
        const summary = {};
        const copyText = (key, limit) => {
            if (typeof err?.[key] === 'string') {
                summary[key] = truncateString(err[key], limit);
            }
        };
        const copySerialized = (key, limit) => {
            if (err?.[key] !== undefined) {
                summary[key] = safeStringify(err[key], limit);
            }
        };
        // The call order below fixes the key order of the resulting summary.
        copyText('message', 2000);
        copyText('stack', 2000);
        if (err?.name) {
            summary.name = String(err.name);
        }
        copySerialized('value', 2000);
        copySerialized('actual', 1000);
        copySerialized('expected', 1000);
        if (err?.location) {
            summary.location = safeStringify(err.location, 500);
        }
        copyText('snippet', 1000);
        return summary;
    });
}
/**
 * Maps a failure reason to its ordered list of remediation steps.
 *
 * Every list contains a `RETRY_AUTOMATION` step. For `SELECTOR_REGRESSION` the order
 * depends on `context.occurredDuringPageAi`: when set, deleting the page.ai cache and
 * retrying comes first and manual selector updates are the fallback. `UNKNOWN` and any
 * unrecognised reason yield a manual-review playbook.
 *
 * @param {string} reason - One of the failure reason identifiers.
 * @param {object} [context={}] - Optional flags; only `occurredDuringPageAi` is read.
 * @returns {Array<{category: string, summary: string, details: string}>} Fresh step objects.
 */
function remediationStepsForReason(reason, context = {}) {
    const step = (category, summary, details) => ({ category, summary, details });
    const retry = (summary, details) => step('RETRY_AUTOMATION', summary, details);
    switch (reason) {
        case 'AUTOMATION_SCRIPT_ISSUE':
            return [
                step('UPDATE_TEST_LOGIC', 'Inspect the failing automation logic.', `Review the Playwright test and any Donobu tool invocations around the failure.
Align the scripted steps with the intended business flow.`),
                retry('Retry after updating the automation.', 'Re-run the test or Donobu flow once the script adjustments are in place to validate the fix.'),
            ];
        case 'SELECTOR_REGRESSION':
            if (context.occurredDuringPageAi) {
                // Cached page.ai selectors are the likeliest culprit: invalidate first.
                return [
                    retry('Delete the test cache and retry the test.', `Delete the cached ${cacheLocator_1.PAGE_AI_CACHE_DIRNAME}/<spec-file>${cacheLocator_1.PAGE_AI_CACHE_FILE_EXTENSION} entry for this test so page.ai recalculates selectors against the live DOM,
then rerun the automation to verify recovery.`),
                    step('UPDATE_SELECTORS', 'Update selectors if the autonomous retry still fails.', 'If cache invalidation and autonomous retry still fail, fall back to manually adjusting the selector strategy.'),
                ];
            }
            return [
                step('UPDATE_SELECTORS', 'Refresh selectors for the affected elements.', 'Use page.find failovers or Playwright locators to update the targeting strategy for the broken element.'),
                retry('Validate selectors by re-running the test.', 'Execute the test or self-healing run to confirm the new selectors resolve the regression.'),
            ];
        case 'TIMING_OR_SYNCHRONISATION':
            return [
                step('ADJUST_TIMING', 'Stabilise async waits and retry logic.', 'Add explicit waits, polling, or guard conditions so the automation aligns with the application response times.'),
                retry('Run the test after timing adjustments.', 'Execute an automation retry to ensure the timing changes eliminate the flake.'),
            ];
        case 'ASSERTION_DRIFT':
            return [
                step('REFINE_ASSERTIONS', 'Revisit expected outcomes and test assertions.', 'Cross-check the assertion expectations against the latest product behaviour and update the checks accordingly.'),
                retry('Confirm updated assertions.', 'Execute the test again once assertions have been updated to verify alignment with the application.'),
            ];
        case 'APPLICATION_DEFECT':
            return [
                step('FIX_APPLICATION', 'Log and prioritise the suspected product defect.', 'Capture reproduction steps using the failing automation and escalate to the owning development team.'),
                step('ESCALATE_MANUAL_REVIEW', 'Coordinate QA verification of the fix.', 'Have QA validate the defect manually and confirm once the product change is deployed.'),
                retry('Re-run automation after the product fix.', 'Execute the test to confirm the application change resolves the failure.'),
            ];
        case 'AUTHENTICATION_FAILURE':
            return [
                step('VALIDATE_AUTHENTICATION', 'Verify credentials and auth flows.', 'Check login secrets, MFA configuration, and session state preconditions for the test environment.'),
                retry('Run after auth prerequisites are restored.', 'Execute the test once authentication is confirmed to be working.'),
            ];
        case 'ENVIRONMENT_CONFIGURATION':
            return [
                step('CHECK_ENVIRONMENT', 'Inspect environment and configuration.', 'Validate environment variables, feature flags, and infrastructure dependencies referenced by the test.'),
                retry('Re-run once environment is stable.', 'Execute automation after configuration corrections to confirm stability.'),
            ];
        case 'TEST_DATA_UNAVAILABLE':
            return [
                step('REFRESH_TEST_DATA', 'Restore or seed required test data.', 'Populate fixtures, reset accounts, or refresh records relied upon by the test.'),
                retry('Run after data restoration.', 'Execute automation with the refreshed data to ensure the flow passes.'),
            ];
        case 'NETWORK_OR_DEPENDENCY':
            return [
                step('STABILIZE_DEPENDENCIES', 'Check external services or network health.', 'Verify the availability and latency of downstream services, APIs, or network connections.'),
                retry('Retry once dependencies recover.', 'Re-run the test when network conditions or dependency status return to normal.'),
            ];
        case 'UNKNOWN':
        default:
            return [
                step('ESCALATE_MANUAL_REVIEW', 'Perform deeper manual triage.', 'Inspect Playwright traces, Donobu tool history, and application logs to narrow down the root cause.'),
                retry('Retry once additional context is gathered.', 'After manual analysis, attempt another automation run to see if the issue reproduces consistently.'),
            ];
    }
}
/**
 * Lists the extra evidence worth collecting for a given failure reason, together with
 * the places to look for it. Reasons without a dedicated entry (including `UNKNOWN`)
 * receive a generic "review traces, metadata and console logs" request.
 *
 * @param {string} reason - One of the failure reason identifiers.
 * @param {object} [_context={}] - Accepted for signature parity; currently unused.
 * @returns {Array<{description: string, suggestedSources: string[]}>} Fresh request objects.
 */
function additionalDataRequestsForReason(reason, _context = {}) {
    const ask = (description, ...suggestedSources) => [
        { description, suggestedSources },
    ];
    switch (reason) {
        case 'SELECTOR_REGRESSION':
            return ask('Collect DOM snapshots or screenshots around the failing selector.', 'Playwright trace viewer', 'Donobu tool call screenshots');
        case 'TIMING_OR_SYNCHRONISATION':
            return ask('Gather network and performance timings for the affected actions.', 'Browser devtools performance logs', 'Backend request metrics');
        case 'APPLICATION_DEFECT':
            return ask('Capture backend logs or Sentry events around the failure window.', 'Application logging platform', 'APM traces');
        case 'AUTHENTICATION_FAILURE':
            return ask('Validate authentication tokens and secrets used by the test.', 'Secret manager', 'Identity provider logs');
        case 'ENVIRONMENT_CONFIGURATION':
            return ask('Review environment variable values and feature flag states.', 'Deployment configuration', 'Infra dashboards');
        case 'TEST_DATA_UNAVAILABLE':
            return ask('Check the lifecycle of the test accounts or fixtures.', 'Test data management system', 'Database snapshots');
        case 'NETWORK_OR_DEPENDENCY':
            return ask('Inspect dependency uptime and recent incidents.', 'Status pages', 'Network monitoring dashboards');
        default:
            return ask('Review Playwright trace, Donobu flow metadata, and browser console logs.', 'Playwright trace viewer', 'Donobu persistence layer');
    }
}
/**
 * Applies lightweight heuristics across Playwright errors and Donobu tool logs
 * to produce a first-pass failure classification and supporting evidence trail.
 *
 * Every error message, every error stack, and every tool-call outcome summary
 * is joined into one lowercased text blob. The rules below are evaluated in
 * order against that blob and the first match wins, so earlier rules take
 * precedence over later ones.
 *
 * @param {Array<{message?: string, stack?: string}>} errorSummaries - Errors
 *   whose `message` and `stack` text is inspected.
 * @param {Array<{outcomeSummary?: string}>} toolCalls - Tool calls whose
 *   `outcomeSummary` text is inspected.
 * @returns {{reason: string, evidence: string[], confidence: number}} The
 *   inferred reason, a one-entry evidence list explaining the match, and a
 *   heuristic confidence between 0 and 1.
 */
function inferFailureReason(errorSummaries, toolCalls) {
    const combinedText = [
        ...errorSummaries.map((err) => err.message ?? ''),
        ...errorSummaries.map((err) => err.stack ?? ''),
        ...toolCalls.map((tc) => tc.outcomeSummary),
    ]
        .filter(Boolean)
        .join('\n')
        .toLowerCase();
    const matches = (pattern) => pattern.test(combinedText);
    const conclude = (reason, confidence, note) => ({
        reason,
        evidence: [note],
        confidence,
    });
    // LocateException: AI-powered element location failed
    if (matches(/locateexception|failed to locate element/i)) {
        if (matches(/no.?matches/i)) {
            return conclude('SELECTOR_REGRESSION', 0.7, 'page.ai.locate() could not find any element matching the description.');
        }
        if (matches(/too.?many.?matches/i)) {
            return conclude('SELECTOR_REGRESSION', 0.7, 'page.ai.locate() matched too many elements and could not disambiguate.');
        }
        return conclude('SELECTOR_REGRESSION', 0.7, 'page.ai.locate() failed to resolve an element.');
    }
    if (matches(/(selector|locator|element|node).*(not found|failed|undefined)/i)) {
        return conclude('SELECTOR_REGRESSION', 0.65, 'Automation reported a missing selector or locator.');
    }
    if (matches(/timed out|timeout|wait.*exceeded|waiting for/i) ||
        matches(/promise.*did not resolve/i)) {
        return conclude('TIMING_OR_SYNCHRONISATION', 0.6, 'Timeout or waiting condition was detected in the failure.');
    }
    // Assertion text that also mentions the network is left for the
    // dependency rule further down.
    if (matches(/expect(ed)?|AssertionError|toEqual|toBe|received|expected/i) &&
        !matches(/network|timeout/)) {
        return conclude('ASSERTION_DRIFT', 0.55, 'Assertion mismatch detected in error details.');
    }
    // HTTP status codes are matched only as standalone numbers: the digit
    // lookarounds stop "401"/"403" from matching inside longer numbers such as
    // "14030". Line numbers in stack frames (e.g. ":403:12") can still match.
    if (matches(/(?<!\d)(?:401|403)(?!\d)|unauthori[sz]ed|forbidden|login|credential|token/i)) {
        return conclude('AUTHENTICATION_FAILURE', 0.6, 'Authentication-related error message detected.');
    }
    if (matches(/env(var|iron)|environment variable|configuration|config/i) ||
        matches(/missing .*config|misconfig/i)) {
        return conclude('ENVIRONMENT_CONFIGURATION', 0.55, 'Environment configuration issue referenced in failure text.');
    }
    if (matches(/test data|fixture|seed data|record not found|no data/i) ||
        matches(/entity.*not found/)) {
        return conclude('TEST_DATA_UNAVAILABLE', 0.55, 'Missing or stale test data referenced.');
    }
    if (matches(/ECONN|ENOTFOUND|EAI_AGAIN|network|socket hang up|connection/i) ||
        matches(/(?<!\d)(?:502|503|504)(?!\d)|gateway|dns/i)) {
        return conclude('NETWORK_OR_DEPENDENCY', 0.6, 'Network or dependency outage detected.');
    }
    // Same digit-boundary guard: "500" must not fire on values like "5000"
    // (ports, millisecond durations, counts).
    if (matches(/(?<!\d)500(?!\d)|internal server error|TypeError|ReferenceError|Unhandled/i)) {
        return conclude('APPLICATION_DEFECT', 0.6, 'Application-side error or exception detected.');
    }
    if (combinedText.trim().length > 0) {
        return conclude('AUTOMATION_SCRIPT_ISSUE', 0.4, 'Falling back to automation script issue from generic error content.');
    }
    return conclude('UNKNOWN', 0.2, 'No diagnostic text available, marking as unknown.');
}
/**
 * Lowercase substrings that, when found in an error's message or stack text,
 * are taken as a sign that the error involves page.ai code. Consumers in this
 * file lowercase the text before calling `includes`, so entries must stay
 * lowercase to match.
 *
 * NOTE(review): 'donobuextendedpage.ai' already contains 'page.ai', so under
 * substring matching it adds no extra coverage — confirm whether it is kept
 * for documentation purposes only.
 */
const PAGE_AI_STACK_MARKERS = [
    'page.ai',
    'pageairunner',
    'pageaiexception',
    'locateexception',
    'locateelement',
    'donobuflow',
    'donobuextendedpage.ai',
];
/**
 * Names of the tools whose appearance in a flow's tool calls is treated as a
 * page.ai indicator. Membership is tested against each tool call's `toolName`.
 */
const PAGE_AI_TOOL_MARKERS = new Set([
    AnalyzePageTextTool_1.AnalyzePageTextTool.NAME,
    SummarizeLearningsTool_1.SummarizeLearningsTool.NAME,
    MarkObjectiveCompleteTool_1.MarkObjectiveCompleteTool.NAME,
    MarkObjectiveNotCompletableTool_1.MarkObjectiveNotCompletableTool.NAME,
]);
/**
 * Reports whether a failure shows any sign of involving Donobu's autonomous
 * page.ai routines, signalling that cached selectors or AI-driven steps may
 * need resets.
 *
 * Two signals are checked, in order:
 *  1. any error whose lowercased message/stack contains a PAGE_AI_STACK_MARKERS entry;
 *  2. any tool call whose `toolName` is in PAGE_AI_TOOL_MARKERS (regardless of
 *     whether that call succeeded).
 *
 * @param {Array<{message?: string, stack?: string}>} errorSummaries
 * @param {Array<{toolName: string}>} toolCalls
 * @returns {boolean} True when either signal is present.
 */
function didFailureOccurDuringPageAi(errorSummaries, toolCalls) {
    const mentionsPageAi = (err) => {
        const haystack = `${err.message ?? ''}\n${err.stack ?? ''}`.toLowerCase();
        return PAGE_AI_STACK_MARKERS.some((marker) => haystack.includes(marker));
    };
    const isPageAiTool = (tc) => PAGE_AI_TOOL_MARKERS.has(tc.toolName);
    // Tool calls are only consulted when no error text points at page.ai.
    return errorSummaries.some(mentionsPageAi) || toolCalls.some(isPageAiTool);
}
/**
 * Analyzes multiple signals to determine if the failure is likely caused by stale
 * page.ai instruction cache versus a legitimate test failure. This nuanced detection
 * helps differentiate between:
 * - Stale cache: cached actions succeeded but were semantically wrong (clicked wrong elements)
 * - Legitimate failure: cache was correct, but assertions reveal real issues
 *
 * The hardest case: page.ai uses stale cache, actions succeed (selectors still exist),
 * but the page was redesigned so we're interacting with wrong elements. This manifests
 * as successful page.ai execution followed by assertion failures about unexpected state.
 *
 * @param {{retry: number, duration: number}} testInfo - Test run info; `retry`
 *   and `duration` (compared against 5000) are read.
 * @param {Array<{message?: string, stack?: string}>} errorSummaries - Errors
 *   whose message/stack text is inspected.
 * @param {Array<{toolName: string, success: boolean, outcomeSummary: string}>} toolCalls
 *   - Tool calls recorded for the flow.
 * @param {{runMode?: string}} [flowMetadata] - Flow metadata; only `runMode` is read.
 * @returns {{
 *   usedDeterministicMode: boolean,
 *   selectorFailedDuringPageAi: boolean,
 *   failedAfterPageAiCompleted: boolean,
 *   isRetryAttempt: boolean,
 *   quickFailurePattern: boolean,
 *   toolCallsShowSelectorIssues: boolean,
 *   assertionsFailedAfterSuccessfulPageAi: boolean
 * }} Independent boolean indicators for downstream scoring.
 */
function analyzeStaleCacheIndicators(testInfo, errorSummaries, toolCalls, flowMetadata) {
    const errorBlob = (err) => `${err.message ?? ''}\n${err.stack ?? ''}`.toLowerCase();
    const referencesPageAi = (blob) => PAGE_AI_STACK_MARKERS.some((marker) => blob.includes(marker));
    // Check if the flow ran in DETERMINISTIC mode (meaning cache was used)
    const usedDeterministicMode = flowMetadata?.runMode === 'DETERMINISTIC';
    // Check if this is a retry attempt (cache would have been invalidated)
    const isRetryAttempt = testInfo.retry > 0;
    // Check if selector or locate issues occurred during page.ai tool execution
    const locateFailedInStack = errorSummaries.some((err) => /locateexception|failed to locate element/i.test(errorBlob(err)));
    const selectorFailedDuringPageAi = locateFailedInStack ||
        (didFailureOccurDuringPageAi(errorSummaries, toolCalls) &&
            toolCalls.some((tc) => {
                const isPageAiTool = PAGE_AI_TOOL_MARKERS.has(tc.toolName);
                const hasSelectorIssue = !tc.success &&
                    /(selector|locator|element|node).*(not found|failed|undefined)/i.test(tc.outcomeSummary);
                return isPageAiTool && hasSelectorIssue;
            }));
    // Check if tool calls show selector issues (more broadly)
    const toolCallsShowSelectorIssues = toolCalls.some((tc) => !tc.success &&
        /(selector|locator|element|node).*(not found|failed|undefined|visible|attached)/i.test(tc.outcomeSummary));
    // Check for quick failure pattern (DETERMINISTIC mode failures are typically fast)
    // When cache is stale, the first cached action often fails quickly
    const quickFailurePattern = usedDeterministicMode &&
        testInfo.duration < 5000 && // Failed in less than 5 seconds
        toolCalls.length > 0 &&
        toolCalls.length < 5; // Few tool calls before failure
    // Check if page.ai completed successfully but subsequent assertions failed
    const pageAiToolCalls = toolCalls.filter((tc) => PAGE_AI_TOOL_MARKERS.has(tc.toolName));
    const hasPageAiCalls = pageAiToolCalls.length > 0;
    const allPageAiCallsSucceeded = hasPageAiCalls && pageAiToolCalls.every((tc) => tc.success);
    const hasPostPageAiFailure = errorSummaries.length > 0 &&
        errorSummaries.some((err) => {
            const blob = errorBlob(err);
            return /expect(ed)?|assertion/i.test(blob) && !referencesPageAi(blob);
        });
    const assertionsFailedAfterSuccessfulPageAi = allPageAiCallsSucceeded && hasPostPageAiFailure;
    // Check if failure occurred after page.ai completed (not during).
    // Only the error text is consulted here. didFailureOccurDuringPageAi()
    // also returns true whenever any page.ai tool call exists, which is the
    // same condition as hasPageAiCalls, so negating it alongside
    // hasPageAiCalls made this indicator permanently false.
    const failureStackReferencesPageAi = errorSummaries.some((err) => referencesPageAi(errorBlob(err)));
    const failedAfterPageAiCompleted = allPageAiCallsSucceeded && !failureStackReferencesPageAi;
    return {
        usedDeterministicMode,
        selectorFailedDuringPageAi,
        failedAfterPageAiCompleted,
        isRetryAttempt,
        quickFailurePattern,
        toolCallsShowSelectorIssues,
        assertionsFailedAfterSuccessfulPageAi,
    };
}
/**
* Maps each failure reason to downstream orchestration attributes and adapts
* them when the failure happened during page.ai execution.
*/
function reasonAttributesFor(reason, context) {
const { occurredDuringPageAi } = context;
const base = {
UNKNOWN: {
shouldRetry: true,
requiresCodeChange: false,
requiresProductFix: false,
},
AUTOMATION_SCRIPT_ISSUE: {
shouldRetry: false,
requiresCodeChange: true,
requiresProductFix: false,
},
SELECTOR_REGRESSION: {
shouldRetr