UNPKG

donobu

Version:

Create browser automations with an LLM agent and replay them as Playwright scripts.

658 lines 31.6 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.ReplayableInteraction = exports.AnnotationBasedSchema = exports.SelectorBasedSchema = void 0; const v4_1 = require("zod/v4"); const PageClosedException_1 = require("../exceptions/PageClosedException"); const ElementSelector_1 = require("../models/ElementSelector"); const ToolSchema_1 = require("../models/ToolSchema"); const Logger_1 = require("../utils/Logger"); const PlaywrightUtils_1 = require("../utils/PlaywrightUtils"); const TargetUtils_1 = require("../utils/TargetUtils"); const Tool_1 = require("./Tool"); /** * Schema containing the minimal, deterministic selector information * necessary to -locate an element on the page. * * Used when a human or an autonomous run has already produced an exact * `{ frame, element }` selector pair and we simply want to replay that * interaction without any involvement from an LLM. */ exports.SelectorBasedSchema = v4_1.z.object({ // WARNING! If you change this field name, you must also update 'prepareToolCallForRerun'! selector: ElementSelector_1.ElementSelectorSchema, }); /** * Schema used when the LLM chooses an element by its numbered Donobu * annotation. * * The LLM supplies: * * `annotation - the chosen annotation number rendered in the * Donobu overlay. * * `whyThisAnnotation - a natural-language explanation that is surfaced * to users (and logged) to aid debugging and transparency. * * During replay these two properties are stripped out and replaced by a * deterministic selector derived from the element actually found on-screen. */ exports.AnnotationBasedSchema = v4_1.z.object({ ...ToolSchema_1.BaseGptArgsSchema.shape, whyThisAnnotation: v4_1.z .string() .describe('The reason why this particular numbered annotation was chosen.'), annotation: v4_1.z .string() .describe('The numbered annotation of the element to click.'), }); /** * Abstract base class for all "selector-driven" browser interactions that * can be *replayed deterministically*. * * A concrete subclass represents a single kind of user action (e.g. * clicking, typing, etc). The class handles the tedious parts: * * 1. **Element discovery.** * From a set of candidate CSS selectors (and an optional frame selector) * it produces one or more `Locator` objects ordered by how specific they * are. The first uniquely matching locator is preferred. * * 2. **Resilience.** * When an element is missing, off-screen, or quickly detaches, the class * automatically falls back to the next best locator until all options are * exhausted. * * 3. **Replay support.** * Every successful call stores selector metadata inside the returned * {@link ToolCallResult}. The static {@link remapForRerun} helper can * later turn that metadata into a fresh {@link ProposedToolCall} so the * exact same element can be located in a future run without LLM input. * * ### Type Parameters * | Name | Meaning | * |----------------|----------------------------------------------------------------------------| * | `CoreSchema` | Zod schema describing *core* parameters consumed by the concrete tool | * | `NonGptSchema` | Schema for deterministic invocations (extends {@link SelectorBasedSchema}) | * | `GptSchema` | Schema for LLM-driven invocations (extends {@link AnnotationBasedSchema}) | * * Subclasses **only** need to implement {@link invoke}, which receives a * *resolved* locator and may perform the actual browser operation (e.g. * click, type, etc). */ class ReplayableInteraction extends Tool_1.Tool { constructor(name, description, coreSchema, inputSchema, inputSchemaForGpt, requiresGpt = false) { super(name, description, inputSchema, inputSchemaForGpt, requiresGpt, undefined, ['web']); this.coreSchema = coreSchema; } async call(context, parameters) { const page = (0, TargetUtils_1.webPage)(context); const locators = await ReplayableInteraction.getLocatorsOrderedByMatchCount(page, parameters.selector, ReplayableInteraction.MAX_LOCATOR_MATCH_COUNT, parameters.selector.element.length); if (!locators.length) { return { isSuccessful: false, forLlm: 'FAILED! Unable to resolve HTML element to operate with.', metadata: { selector: parameters.selector, }, }; } return this.callCore(context, parameters, locators, parameters.selector); } async callFromGpt(context, parameters) { const page = (0, TargetUtils_1.webPage)(context); const elementSelector = `[${context.targetInspector.interactableElementAttribute}="${parameters.annotation}"]`; let locatorData = null; for (const frame of page.frames()) { if (frame.isDetached()) { continue; } const locator = frame.locator(elementSelector); if ((await locator.count()) > 0) { const selectorCandidates = (await PlaywrightUtils_1.PlaywrightUtils.generateSelectors(locator)).slice(0, ReplayableInteraction.MAX_SELECTOR_FAILOVERS); const frameSelector = frame.parentFrame() === null ? null : (await PlaywrightUtils_1.PlaywrightUtils.generateSelectors(await frame.frameElement()))[0]; locatorData = { locators: [locator], selectorForReplay: { element: selectorCandidates, frame: frameSelector, }, }; break; } } if (!locatorData?.locators.length) { return { isSuccessful: false, forLlm: `FAILED! Unable to resolve HTML element for annotation '${parameters.annotation}'.`, metadata: { annotation: parameters.annotation, attemptedSelector: elementSelector, }, }; } return this.callCore(context, parameters, locatorData.locators.map((locator) => { return { selector: elementSelector, locator: locator, }; }), locatorData.selectorForReplay); } async callCore(context, parameters, selectorLocators, selectorForReplay) { // Extract core parameters using the stored schema const coreParameters = this.coreSchema.parse(parameters); const timeoutOpt = { timeout: 1000, // Millis }; const page = (0, TargetUtils_1.webPage)(context); const selectorAttempts = []; for (const selectorLocator of selectorLocators) { const attemptSummary = { selector: selectorLocator.selector, matchCount: null, errors: [], }; selectorAttempts.push(attemptSummary); try { const count = await selectorLocator.locator.count(); attemptSummary.matchCount = count; if (count === 0) { attemptSummary.errors.push('Locator resolved to 0 elements before interaction.'); } for (let i = 0; i < count; ++i) { try { const originalLocator = selectorLocator.locator.nth(i); const targetHandle = await originalLocator .first() .elementHandle(timeoutOpt); if (!targetHandle) { throw new Error(`Failed to resolve element handle for selector '${selectorLocator.selector}'`); } const labelLocator = await ReplayableInteraction.getLocatorOrItsLabel(originalLocator); let labelHandle; try { labelHandle = await labelLocator .first() .elementHandle(timeoutOpt); } catch { labelHandle = null; } const chosenHandle = labelHandle ?? targetHandle; // Check if element is still attached before attempting to do anything meaningful. const isAttached = await chosenHandle.evaluate((el) => el.isConnected); if (!isAttached) { throw new Error(`Element '${i}' for selector '${selectorLocator.selector}' does not exist in the DOM`); } let htmlSnippet = ''; try { htmlSnippet += await (0, TargetUtils_1.webInspector)(context).pageInspector.getHtmlSnippet(chosenHandle); } catch (error) { Logger_1.appLogger.warn('Failed to get HTML snippet', error); } const forLlm = await this.invoke(context, coreParameters, { target: targetHandle, label: labelHandle ?? undefined, }); await PlaywrightUtils_1.PlaywrightUtils.waitForPageStability(page); // Be careful to not report back the internal, ephemeral, Donobu selector // used for autonomous runs. If we are in autonomous mode, just report // back the top-resolved selector. const reportedSelector = !selectorLocator.selector.includes(context.targetInspector.interactableElementAttribute) ? selectorLocator.selector : selectorForReplay.element[0]; return { isSuccessful: true, forLlm: forLlm + htmlSnippet, metadata: { ...selectorForReplay, usedSelector: reportedSelector, }, }; } catch (elementError) { if (PlaywrightUtils_1.PlaywrightUtils.isPageClosedError(elementError)) { throw elementError; } Logger_1.appLogger.error(`Failed to interact with element '${i}' for selector '${selectorLocator.selector}' due to exception, will fail over to remaining elements (if any)`, elementError); attemptSummary.errors.push(`element ${i}: ${ReplayableInteraction.describeError(elementError)}`); } } } catch (locatorError) { if (PlaywrightUtils_1.PlaywrightUtils.isPageClosedError(locatorError)) { throw locatorError; } Logger_1.appLogger.error(`Failed to interact with selector '${selectorLocator.selector}' due to exception, will fail over to remaining selectors (if any)`, locatorError); attemptSummary.errors.push(`selector error: ${ReplayableInteraction.describeError(locatorError)}`); } } return { isSuccessful: false, forLlm: `FAILED! Unable to apply operation. ${ReplayableInteraction.summarizeAttemptsForLlm(selectorAttempts)}`, metadata: { selectorForReplay: selectorForReplay, selectorAttempts: selectorAttempts, }, }; } static describeError(error) { if (error instanceof Error) { return `${error.name}: ${error.message}`; } if (typeof error === 'string') { return error; } try { return JSON.stringify(error); } catch { return String(error); } } static summarizeAttemptsForLlm(attempts) { if (!attempts.length) { return 'No selector attempts were recorded.'; } const maxAttemptsToReport = 3; const summarized = attempts.slice(0, maxAttemptsToReport).map((attempt) => { const matchCount = attempt.matchCount === null ? 'unknown' : attempt.matchCount; const errors = attempt.errors.length > 0 ? ` errors: ${attempt.errors .slice(0, 2) .join('; ')}${attempt.errors.length > 2 ? '...' : ''}` : ''; return `'${attempt.selector}' (matches=${matchCount}${errors})`; }); const trimmed = attempts.length > maxAttemptsToReport ? ' (trimmed to first attempts)' : ''; return `Tried ${attempts.length} selector(s): ${summarized.join(' | ')}${trimmed}.`; } /** * Retrieves a list of {@link Locator} objects based on the provided selector * candidates, ordered by their match count in ascending order. If the match * count for a locator exceeds * {@link ReplayableInteraction.MAX_LOCATOR_MATCH_COUNT}, then they are ignored, as * it is considered too broad of a locator to be useful. * * This method iterates through a list of CSS selector candidates and creates * a {@link Locator} for each candidate. It counts the number of elements that * match each selector within a given page. If a frame selector is provided, * it looks for the elements within the specified frame; otherwise, it * searches within the entire page. Only locators with a positive match count * are added to the result list. * * @param page - The the web page to search within. * @return A list of {@link Locator} objects that have been found, ordered by * their match count in ascending order. */ static async getLocatorsOrderedByMatchCount(page, selector, maxLocatorMatchCount = ReplayableInteraction.MAX_LOCATOR_MATCH_COUNT, maxSelectorFailovers = ReplayableInteraction.MAX_SELECTOR_FAILOVERS) { if (selector.element.length === 0) { return []; } try { // Store the first selector promise separately let firstSelectorResolver = null; // Create a promise we can resolve early if the first selector is unique const firstSelectorUniqueMatch = new Promise((resolve) => { firstSelectorResolver = resolve; }); // Process all selectors in parallel const allSelectorsPromise = new Promise((resolve, reject) => { const locatorResults = []; // Create and execute all promises in parallel const promises = selector.element .map((selectorCandidate) => { return PlaywrightUtils_1.PlaywrightUtils.normalizeSelector(selectorCandidate); }) .map(async (selectorCandidate, index) => { try { const elementLocator = selector.frame ? page.frameLocator(selector.frame).locator(selectorCandidate) : page.locator(selectorCandidate); await elementLocator .waitFor({ state: 'attached', timeout: 2000, }) .catch(() => { // Continue if timeout }); const count = await elementLocator.count(); // Special case: first selector with exactly one match if (index === 0 && count === 1) { firstSelectorResolver([ { selector: selectorCandidate, locator: elementLocator }, ]); } if (count > 0 && maxLocatorMatchCount >= count) { locatorResults.push({ selector: selectorCandidate, locator: elementLocator, count, }); } } catch (e) { Logger_1.appLogger.warn(`Invalid selector: ${selectorCandidate}`, e); } }); // Wait for all selectors to be processed Promise.all(promises) .then(() => { // Sort and return the results resolve(locatorResults .sort((a, b) => { if (a.count !== b.count) { return a.count - b.count; } return (ReplayableInteraction.selectorTiebreakerPriority(a.selector) - ReplayableInteraction.selectorTiebreakerPriority(b.selector)); }) .map((item) => { return { selector: item.selector, locator: item.locator }; })); }) .catch(reject); }); // Race between the fast path and full processing const prelimiaryList = await Promise.race([ firstSelectorUniqueMatch, allSelectorsPromise, ]); return prelimiaryList.slice(0, maxSelectorFailovers); } catch (error) { if (PlaywrightUtils_1.PlaywrightUtils.isPageClosedError(error)) { throw new PageClosedException_1.PageClosedException(); } else { throw error; } } } /** * Returns the given locator or its label, if it has one and is associated with a labelable element. * Labelable elements include: button, input (except hidden), meter, output, progress, select, and textarea. */ static async getLocatorOrItsLabel(element) { const timeoutMilliseconds = 5000; try { // First check if the element itself is visible try { await element.waitFor({ state: 'visible', timeout: timeoutMilliseconds, }); } catch (error) { // If the element itself isn't visible, just return it without looking for labels if (PlaywrightUtils_1.PlaywrightUtils.isPageClosedError(error)) { throw new PageClosedException_1.PageClosedException(); } return element; } // Check if the element is a labelable element // See https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/label const isLabelable = await element.evaluate((elem) => { const tagName = elem.tagName.toLowerCase(); if (tagName === 'button' || tagName === 'meter' || tagName === 'output' || tagName === 'progress' || tagName === 'select' || tagName === 'textarea') { return true; } if (tagName === 'input') { // Input is labelable except when type="hidden" const type = elem.getAttribute('type')?.toLowerCase() || 'text'; return type !== 'hidden'; } return false; }); // If the element is not labelable, return the original locator if (!isLabelable) { return element; } // Try to get the ID attribute const id = await element .getAttribute('id', { timeout: timeoutMilliseconds, }) .catch(() => null); if (id) { try { // Get the underlying ElementHandle first const handle = await element.elementHandle({ timeout: timeoutMilliseconds, }); if (handle) { // From the handle, retrieve the frame that owns this element const frame = await handle.ownerFrame(); if (frame) { // Look for label with matching 'for' attribute const labelByFor = frame.locator(`label[for="${id}"]`); // Check if the label exists and is visible if ((await labelByFor.count()) > 0) { try { await labelByFor.waitFor({ state: 'visible', timeout: timeoutMilliseconds, }); return labelByFor; } catch (visibilityError) { // If label exists but isn't visible, continue to other checks if (PlaywrightUtils_1.PlaywrightUtils.isPageClosedError(visibilityError)) { throw new PageClosedException_1.PageClosedException(); } } } } } } catch (error) { // Handle errors in the ElementHandle/frame retrieval, but continue to other checks if (PlaywrightUtils_1.PlaywrightUtils.isPageClosedError(error)) { throw new PageClosedException_1.PageClosedException(); } } } // Fallback: look for a directly wrapping label (immediate ancestor) // Using 'xpath=parent::label' to only get direct parent if it's a label const directParentLabel = element.locator('xpath=parent::label'); if ((await directParentLabel.count()) > 0) { try { await directParentLabel.waitFor({ state: 'visible', timeout: timeoutMilliseconds, }); return directParentLabel; } catch (visibilityError) { if (PlaywrightUtils_1.PlaywrightUtils.isPageClosedError(visibilityError)) { throw new PageClosedException_1.PageClosedException(); } } } // If direct parent isn't a label, check any ancestor label const wrappingLabel = element.locator('xpath=ancestor::label[1]'); // Get closest ancestor label if ((await wrappingLabel.count()) > 0) { try { await wrappingLabel.waitFor({ state: 'visible', timeout: timeoutMilliseconds, }); return wrappingLabel; } catch (visibilityError) { if (PlaywrightUtils_1.PlaywrightUtils.isPageClosedError(visibilityError)) { throw new PageClosedException_1.PageClosedException(); } } } // Neither "for" nor wrapping label found or visible return element; } catch (error) { if (PlaywrightUtils_1.PlaywrightUtils.isPageClosedError(error)) { throw new PageClosedException_1.PageClosedException(); } throw error; } } /** * Calculates a priority value for a CSS selector to be used in tiebreaker situations * when multiple selectors have the same match count. * * The function assigns lower numerical values to selectors that are generally more * reliable and specific for targeting elements in a user interface. */ static selectorTiebreakerPriority(sel) { // Convert selector to lowercase for case-insensitive matching of attribute names const lowerSel = sel.toLowerCase(); // Check for aria-label selectors (highest priority) if (/\[aria-label\s*=/.test(lowerSel) || /@aria-label/.test(lowerSel)) { return 0; } // Check for data-testid based selectors if (/\[(data-testid|data-test-id)\s*(=|~=|\^=|\$=|\*=)/.test(lowerSel) || /@testid/.test(lowerSel)) { return 1; } // Check for ID-based selectors if (/^#/.test(sel)) { return 2; } // Check for placeholder selectors if (/\[placeholder\s*=/.test(lowerSel) || /@placeholder/.test(lowerSel)) { return 3; } // Check for text-based XPath selectors if (PlaywrightUtils_1.PlaywrightUtils.isXpathSelector(sel)) { return 4; } // Any other selector types return 5 + sel.length; } /** * Helper function to check if a selector is ID-based */ static isIdBasedSelector(selector) { const trimmed = selector.trim(); const isHashId = /^#/.test(trimmed); const isAttrId = /\[id\s*=\s*['"]?[^\]]+['"]?\]/i.test(trimmed) || /@id\s*=\s*['"]?[^\]]+['"]?/i.test(trimmed); return isHashId || isAttrId; } /** * Helper function to check if a selector is aria-based */ static isAriaBasedSelector(selector) { const lowerSel = selector.toLowerCase(); return /\[aria-label\s*=/.test(lowerSel) || /@aria-label/.test(lowerSel); } /** * Transform a historical {@link ToolCall} into a replay-ready * {@link ProposedToolCall}. * * The original execution stored selector metadata inside * `toolCall.outcome.metadata`. This helper: * * 1. **Validates** that the metadata is present and still contains a * deterministic selector. * 2. **Builds** a new `parameters` object which: * * Moves the recorded selector into the expected `selector` slot. * * Removes any `annotation` / `whyThisAnnotation` properties (they are * meaningless for deterministic replays). * 3. **Returns** a ready-to-run {@link ProposedToolCall} that points to the * same tool (`toolName`) with the adjusted parameters. * * If the historical call did **not** record selector metadata, replaying * would be impossible and the method throws an `Error` so the caller can * handle this situation explicitly (e.g. fall back to a fresh LLM * invocation or surface an error to the user). * * @param toolCall - The historical {@link ToolCall}. * @param options - Behaviour switches: * * `areElementIdsVolatile` - If `true`, ID-only selectors * (e.g. `#submit-btn`) are *dropped* because the element's `id` * attribute is considered volatile. * When every candidate is ID-based the list is left unchanged * (replay is better than nothing). * * `useSelectorFailover` - If `false`, only the **first** selector * (the most specific one) is kept, disabling automatic fail-over to * broader selectors. * * @returns A {@link ProposedToolCall} that can be executed in a fresh run. * @throws Error if the original call lacks selector metadata. */ prepareForRerun(toolCall, options) { // If the tool call is from a GPT, we need the result metadata to continue. if (!('selector' in toolCall.parameters)) { if (!toolCall.outcome.metadata) { throw new Error(`Failed to prepare tool call for rerun: result metadata is missing. Original call: ${JSON.stringify(toolCall)}`); } else if (!toolCall.outcome.metadata.element) { throw new Error(`Failed to prepare tool call for rerun: selector data is absent in metadata. Original call: ${JSON.stringify(toolCall)}`); } } // Parse and clone the stored selector ------------------------------------ const originalSelector = 'selector' in toolCall.parameters ? ElementSelector_1.ElementSelectorSchema.parse(toolCall.parameters.selector) : ElementSelector_1.ElementSelectorSchema.parse(toolCall.outcome.metadata); let selectorCandidates = [...originalSelector.element]; // 1) Drop ID-based selectors if requested -------------------------------- if (options.areElementIdsVolatile) { const nonId = selectorCandidates.filter((sel) => { return !ReplayableInteraction.isIdBasedSelector(sel); }); // Use the filtered list only if something remains; otherwise keep the // original list to avoid total selector loss. if (nonId.length > 0) { selectorCandidates = nonId; } } else { // 2) Promote ID-based selectors to the top if areElementIdsVolatile is not true // Exception: Do not promote ID selectors above aria-based selectors const firstSelectorIsAria = selectorCandidates.length > 0 && ReplayableInteraction.isAriaBasedSelector(selectorCandidates[0]); if (!firstSelectorIsAria) { const idBasedSelectors = selectorCandidates.filter((sel) => ReplayableInteraction.isIdBasedSelector(sel)); const nonIdBasedSelectors = selectorCandidates.filter((sel) => !ReplayableInteraction.isIdBasedSelector(sel)); // Reorder: ID-based selectors first, then non-ID-based selectors if (idBasedSelectors.length > 0) { selectorCandidates = [...idBasedSelectors, ...nonIdBasedSelectors]; } } // If first selector is aria-based, leave the order unchanged } // 3) Disable fail-over if requested -------------------------------------- if (options.disableSelectorFailover && selectorCandidates.length > 1) { selectorCandidates = [selectorCandidates[0]]; } const modifiedSelector = { element: selectorCandidates, ...(originalSelector.frame ? { frame: originalSelector.frame } : {}), }; // Build the new parameter object ---------------------------------------- const revisedArgs = { ...toolCall.parameters, selector: modifiedSelector, annotation: undefined, whyThisAnnotation: undefined, }; return { name: toolCall.toolName, parameters: revisedArgs, }; } } exports.ReplayableInteraction = ReplayableInteraction; ReplayableInteraction.MAX_SELECTOR_FAILOVERS = 3; ReplayableInteraction.MAX_LOCATOR_MATCH_COUNT = 3; //# sourceMappingURL=ReplayableInteraction.js.map