/*
 * donobu
 * Version: (not captured)
 * Create browser automations with an LLM agent and replay them as Playwright scripts.
 * 323 lines (308 loc) • 15.8 kB
 * JavaScript
 */
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.locateElement = locateElement;
const v4_1 = require("zod/v4");
const Logger_1 = require("../../../utils/Logger");
const PlaywrightUtils_1 = require("../../../utils/PlaywrightUtils");
const TemplateInterpolator_1 = require("../../../utils/TemplateInterpolator");
const buildLocator_1 = require("./buildLocator");
const domSnapshot_1 = require("./domSnapshot");
const LocateException_1 = require("./LocateException");
const locateSchema_1 = require("./locateSchema");
/**
 * Maximum number of matches for which we attempt per-element disambiguation.
 * A locator matching between 2 and this many elements (inclusive) is handed to
 * `disambiguate`; a locator matching more than this is not disambiguated.
 */
const DISAMBIGUATE_THRESHOLD = 5;
/**
 * Maximum outer HTML length per snippet shown during disambiguation.
 * Longer `outerHTML` strings are cut to this many characters and an ellipsis
 * is appended.
 */
const SNIPPET_MAX_CHARS = 200;
/**
 * Turn a natural-language element description into a Playwright {@link Locator}.
 *
 * Flow:
 *  1. Take a viewport screenshot and a pruned DOM snapshot of the page.
 *  2. Ask the LLM for a structured {@link LocateResult}.
 *  3. Build a {@link Locator} from that result and count its matches:
 *     - exactly 1 → return it.
 *     - 2–{@link DISAMBIGUATE_THRESHOLD} → let the LLM pick one candidate (`nth`).
 *     - 0 or more than {@link DISAMBIGUATE_THRESHOLD} → retry once with feedback
 *       (a zero-match retry re-captures the DOM with a larger budget).
 *  4. If the retry does not resolve either, throw a {@link LocateException}.
 *
 * The {@link LocateResult} is returned next to the Locator so callers can
 * cache it for deterministic replay.
 *
 * @param page Playwright page to search.
 * @param description Natural-language description of the target element.
 * @param gptClient LLM client used for the structured-output calls.
 * @param options Optional settings; `envData` and `signal` are read from it.
 * @returns `{ locator, result }` for the resolved element.
 * @throws {LocateException} Reason `no_matches` or `too_many_matches` when the
 *   retry attempt also fails to resolve.
 */
async function locateElement(page, description, gptClient, options) {
    const envData = options?.envData;
    const signal = options?.signal;
    const { captureDomSnapshot } = domSnapshot_1;
    const { buildLocator } = buildLocator_1;
    // True when the match count is small enough to resolve by showing candidates.
    const canDisambiguate = (matchCount) => matchCount > 1 && matchCount <= DISAMBIGUATE_THRESHOLD;

    const viewportShot = await PlaywrightUtils_1.PlaywrightUtils.takeViewportScreenshot(page);
    const initialSnapshot = await captureDomSnapshot(page);
    Logger_1.appLogger.debug(`locate: DOM snapshot captured (${initialSnapshot.html.length} chars, ${initialSnapshot.omittedCount} nodes omitted)`);
    const currentUrl = page.url();
    const currentTitle = await page.title();
    const systemMessage = buildSystemMessage(currentUrl, currentTitle, description, envData);
    const initialPrompt = buildUserMessage(description, viewportShot, initialSnapshot.html);

    // ---- Attempt 1 ----
    const initialResult = await callLlm(gptClient, systemMessage, initialPrompt, signal);
    const initialLocator = buildLocator(page, initialResult, envData);
    const initialCount = await safeCount(initialLocator);
    Logger_1.appLogger.debug(`locate: first attempt matched ${initialCount} element(s)`);
    if (initialCount === 1) {
        return { locator: initialLocator, result: initialResult };
    }
    if (canDisambiguate(initialCount)) {
        return await disambiguate(page, description, gptClient, initialLocator, initialResult, initialCount, envData, signal);
    }

    // ---- Attempt 2: nothing matched, or far too many matched ----
    const attemptSummary = summarizeLocateResult(initialResult);
    let domForRetry = initialSnapshot.html;
    let feedback;
    if (initialCount === 0) {
        // The target may have been pruned from the first snapshot, so look again
        // with a bigger DOM budget.
        const widerSnapshot = await captureDomSnapshot(page, 160_000);
        domForRetry = widerSnapshot.html;
        Logger_1.appLogger.debug(`locate: expanded DOM snapshot for retry (${domForRetry.length} chars, ${widerSnapshot.omittedCount} nodes omitted)`);
        feedback = `Your locator matched no elements on the page. Your previous attempt was: ${attemptSummary}. Examine the DOM snapshot more carefully and try a different approach.`;
    }
    else {
        feedback = `Your locator matched ${initialCount} elements, which is too many to disambiguate. Your previous attempt was: ${attemptSummary}. Write a more specific locator.`;
    }
    const retryPrompt = buildRetryMessage(description, feedback, viewportShot, domForRetry);
    const secondResult = await callLlm(gptClient, systemMessage, retryPrompt, signal);
    const secondLocator = buildLocator(page, secondResult, envData);
    const secondCount = await safeCount(secondLocator);
    Logger_1.appLogger.debug(`locate: retry matched ${secondCount} element(s)`);
    if (secondCount === 1) {
        return { locator: secondLocator, result: secondResult };
    }
    if (canDisambiguate(secondCount)) {
        return await disambiguate(page, description, gptClient, secondLocator, secondResult, secondCount, envData, signal);
    }

    // ---- Out of attempts ----
    if (secondCount === 0) {
        throw new LocateException_1.LocateException(description, 'no_matches', 'No elements matched after retry.', secondResult);
    }
    throw new LocateException_1.LocateException(description, 'too_many_matches', `Still matched ${secondCount} elements after retry.`, secondResult);
}
/**
 * Resolve a small set of matches by showing the LLM a truncated HTML snippet
 * and on-page position for each candidate and asking it to choose one.
 *
 * @param page Playwright page the locator belongs to.
 * @param description Natural-language description of the target element.
 * @param gptClient LLM client used for the structured-output call.
 * @param locator Locator that matched `count` elements.
 * @param locateResult The {@link LocateResult} that produced `locator`.
 * @param count Number of candidates to present.
 * @param envData Optional env values used to resolve `{{...}}` placeholders in the description.
 * @param signal Optional abort signal forwarded to the LLM call.
 * @returns `{ locator, result }` where `result` is `locateResult` with `nth`
 *   set to the chosen index and `locator` is rebuilt from that result.
 */
async function disambiguate(page, description, gptClient, locator, locateResult, count, envData, signal) {
    // Executed in the page: serialise the element, truncating long markup.
    const readOuterHtml = (el, max) => {
        const raw = el.outerHTML;
        return raw.length > max ? raw.slice(0, max) + '…' : raw;
    };
    const candidateLines = [];
    for (let idx = 0; idx < count; idx += 1) {
        const candidate = locator.nth(idx);
        // Placeholders are used whenever either read fails for this candidate.
        let html = '<could not read element>';
        let position = '(unknown)';
        try {
            const markup = await candidate.evaluate(readOuterHtml, SNIPPET_MAX_CHARS);
            const box = await candidate.boundingBox();
            const where = box
                ? `(${Math.round(box.x)}, ${Math.round(box.y)})`
                : '(unknown)';
            html = markup;
            position = where;
        }
        catch {
            // Keep the placeholders; one unreadable candidate must not abort the lookup.
        }
        candidateLines.push(`[${idx}] position=${position}\n ${html}`);
    }
    const snippetText = candidateLines.join('\n\n');

    const { z } = v4_1;
    const indexField = z
        .number()
        .int()
        .min(0)
        .max(count - 1)
        .describe('Zero-based index of the element that best matches the description.');
    const disambigSchema = z.object({ index: indexField });

    // The answer here is only an index: it is never cached and never passes
    // through `buildLocator` as text. So show the LLM the description with env
    // placeholders already resolved, letting it compare against candidate HTML
    // directly.
    let resolvedDescription = description;
    if (envData && description.includes('{{')) {
        const { interpolateString } = TemplateInterpolator_1;
        resolvedDescription = interpolateString(description, { env: envData, calls: [] });
    }

    const systemMsg = {
        type: 'system',
        text: `You are resolving an ambiguous element lookup. The user described an element and your locator matched ${count} candidates. Choose the one that best matches the description.`,
    };
    const promptText = `Description: "${resolvedDescription}"\n\nCandidates:\n${snippetText}\n\nReturn the index of the best match.`;
    const userMsg = {
        type: 'user',
        items: [{ type: 'text', text: promptText }],
    };
    const resp = await gptClient.getStructuredOutput([systemMsg, userMsg], disambigSchema, { signal });
    const chosenIndex = resp.output.index;
    Logger_1.appLogger.debug(`locate: disambiguation chose index ${chosenIndex} of ${count}`);

    const disambiguatedResult = { ...locateResult, nth: resp.output.index };
    const { buildLocator } = buildLocator_1;
    return {
        locator: buildLocator(page, disambiguatedResult, envData),
        result: disambiguatedResult,
    };
}
/**
 * Send one system + user message pair to the LLM and return the structured
 * locate result it produced.
 *
 * @param gptClient LLM client exposing `getStructuredOutput`.
 * @param systemMessage System prompt message.
 * @param userMessage User message (screenshot, DOM snapshot, description).
 * @param signal Optional abort signal forwarded to the client.
 * @returns The `output` field of the client's response.
 */
async function callLlm(gptClient, systemMessage, userMessage, signal) {
    const conversation = [systemMessage, userMessage];
    const { output } = await gptClient.getStructuredOutput(conversation, locateSchema_1.LocateResultSchema, { signal });
    return output;
}
/**
 * Build the system prompt for the locate call.
 *
 * The base prompt always carries the locator rules, stability guidance,
 * examples, and the page URL/title. A block of env-var instructions is
 * appended only when the raw description references at least one of the
 * supplied env vars as `{{$.env.NAME}}`, which keeps the prompt small in the
 * common case.
 *
 * @param pageUrl URL of the page being inspected.
 * @param pageTitle Title of the page being inspected.
 * @param description Raw (uninterpolated) element description.
 * @param envData Optional map of env var names to their current values.
 * @returns A `{ type: 'system', text }` message.
 */
function buildSystemMessage(pageUrl, pageTitle, description, envData) {
    // Keep only the env vars that the description actually mentions.
    const referenced = Object.entries(envData ?? {}).filter(([name]) => description.includes(`{{$.env.${name}}}`));
    let envBlock = '';
    if (referenced.length > 0) {
        const mappingLines = referenced
            .map(([name, value]) => ` - {{$.env.${name}}} = ${JSON.stringify(value)}`)
            .join('\n');
        envBlock = `
The user's description contains environment variable references using the syntax
\`{{$.env.NAME}}\`. To keep cached locators valid across runs with different env
values, you MUST emit those same placeholders in any LocatorStep \`text\`,
\`name\`, or \`testId\` field whose contents come from an env var. Do NOT bake
the literal current value into the step.
Original (uninterpolated) description: "${description}"
Current env mapping (use these to identify which substrings on the page came
from which env var, then emit the placeholder rather than the literal):
${mappingLines}
Hard rules for env-var emission:
- Use placeholders ONLY in \`text\`, \`name\`, or \`testId\` fields.
- NEVER emit \`{{$.env.*}}\` inside \`selector\` (CSS/XPath) — interpolating
raw values into a CSS selector can produce invalid syntax. Use a semantic
locator (getByRole/getByText/getByLabel/getByPlaceholder/getByTestId)
instead when an env-derived value is involved.
- NEVER emit \`{{$.env.*}}\` inside any \`frames[]\` entry (iframe selectors
or iframe \`name\` attributes are not env-driven).
Examples:
- Description "The user row for {{$.env.TEST_EMAIL}}", TEST_EMAIL="alice@x.com",
page text shows "alice@x.com" →
[{ method: "getByText", text: "{{$.env.TEST_EMAIL}}" }]
- Description "The {{$.env.PROJECT_NAME}} tab", PROJECT_NAME="Apollo" →
[{ method: "getByRole", role: "tab", name: "{{$.env.PROJECT_NAME}}" }]
- Description "The submit button" (no env vars referenced) → emit literal text
as you normally would.
Combining env vars with regex: env interpolation runs BEFORE regex compilation,
so you can mix them. Prefer this when the env value should be matched alongside
dynamic page content. Example — description "The row for {{$.env.USER}} with
their score", USER="alice" →
[{ method: "getByText", text: "alice — \\\\d+ pts", textIsRegex: true }]
(Here the AI substituted the env value because it's part of a regex pattern;
the placeholder syntax also works — \`text: "{{$.env.USER}} — \\\\d+ pts"\` —
and is preferred when you want cache stability across env value changes.)`;
    }
    const text = `You are a Playwright locator expert. Given a viewport screenshot and a pruned DOM snapshot of a webpage, return a structured locator that targets the element matching the user's description.
Rules:
- Prefer semantic locators: getByRole, getByText, getByLabel, getByPlaceholder, getByTestId.
- Use CSS selectors (method: "locator") only when semantic locators cannot express the target.
- Use at most 3 chained steps. Prefer fewer steps when possible.
- If the element is inside an iframe, specify the frame(s) in the "frames" field.
- Do NOT set "nth" unless you are certain the chain matches multiple elements and you know which index is correct. When unsure, omit it — the system will handle disambiguation.
Stability rules — locators are CACHED and replayed across runs. The page may
change between runs (vote counts increment, "3 hours ago" becomes "5 hours ago",
new posts shift positions, prices fluctuate). Choose locators that survive these
drifts:
- POSITIONAL DESCRIPTIONS: when the description references position ("first",
"third", "fourth from the top", "last"), translate that into a structural
chain plus \`nth\` rather than baking position-specific page text into a step.
Example — "the fourth comments link" should be a locator over ALL comment
links with \`nth: 3\`, not the literal "36 comments" you happen to see today.
- DYNAMIC TEXT: if the value you would put into \`name\` or \`text\` looks
dynamic — contains digits, timestamps, "X ago", "$X.XX", counts, scores,
vote totals — emit a regex pattern via \`nameIsRegex: true\` (for getByRole)
or \`textIsRegex: true\` (for getByText/getByLabel/getByPlaceholder) instead
of the literal value. Anchor the pattern with \`^\` / \`$\` when the whole
string should match, otherwise it acts as a substring match.
- DO NOT combine \`exact: true\` with \`nameIsRegex\`/\`textIsRegex\`. They are
mutually exclusive — set \`exact\` only for literal-string steps with stable
fixed labels like "Submit" or "Sign In".
- SAFE LITERALS: keep literal values for genuinely stable strings — fixed UI
labels, button text like "Submit"/"Cancel", section headings, unique
test-ids. Only escape to regex when stability is at risk.
Examples:
- "The fourth comments link" →
steps: [{ method: "getByRole", role: "link", name: "\\\\d+\\\\s+comments?$", nameIsRegex: true }]
nth: 3
- "The headline of the third story" → structural row selector + nth: 2 (literal name)
- "The submit button" → literal name: "Submit", optionally exact: true
- "The price tag for the cart total" →
steps: [{ method: "getByText", text: "\\\\$\\\\d+(\\\\.\\\\d+)?", textIsRegex: true }]
- "The 'posted 5 hours ago' label" →
steps: [{ method: "getByText", text: "posted \\\\d+ (minute|hour|day)s? ago", textIsRegex: true }]
Regex format: emit a JS-style regex source string (no leading/trailing slash,
no flags). Backslashes inside JSON must be doubled (\`\\\\d+\` not \`\\d+\`).
Invalid patterns silently fall back to literal matching, so prefer simple,
well-tested patterns.
Page URL: ${pageUrl}
Page title: ${pageTitle}${envBlock}`;
    return { type: 'system', text };
}
/**
 * Build the user message for the first locate attempt: the screenshot, the
 * DOM snapshot, and the task statement, in that order.
 *
 * @param description Element description to locate.
 * @param screenshot JPEG bytes of the viewport screenshot.
 * @param domHtml Pruned DOM snapshot HTML.
 * @returns A `{ type: 'user', items }` message.
 */
function buildUserMessage(description, screenshot, domHtml) {
    const screenshotItem = { type: 'jpeg', bytes: screenshot };
    const domItem = { type: 'text', text: `DOM snapshot:\n${domHtml}` };
    const taskItem = {
        type: 'text',
        text: `Find the element matching this description: "${description}"`,
    };
    return { type: 'user', items: [screenshotItem, domItem, taskItem] };
}
/**
 * Build the user message for the retry attempt. Same layout as the first
 * attempt, except the task statement is prefixed with feedback about why the
 * previous locator was rejected.
 *
 * @param description Element description to locate.
 * @param feedback Explanation of what went wrong with the previous attempt.
 * @param screenshot JPEG bytes of the viewport screenshot.
 * @param domHtml DOM snapshot HTML to show on the retry.
 * @returns A `{ type: 'user', items }` message.
 */
function buildRetryMessage(description, feedback, screenshot, domHtml) {
    const screenshotItem = { type: 'jpeg', bytes: screenshot };
    const domItem = { type: 'text', text: `DOM snapshot:\n${domHtml}` };
    const taskItem = {
        type: 'text',
        text: `${feedback}\n\nFind the element matching this description: "${description}"`,
    };
    return { type: 'user', items: [screenshotItem, domItem, taskItem] };
}
/**
 * Produce a human-readable, Playwright-like summary of a {@link LocateResult}
 * so the LLM (and developers reading logs) can see what was already tried.
 *
 * Values flagged as regex (`nameIsRegex` / `textIsRegex`) are rendered as
 * `/pattern/` and `exact: true` is shown as an option, so the retry feedback
 * does not present a regex attempt as if it were a literal. Steps without
 * those flags render exactly as `method('value')`.
 *
 * @param result Locate result with a `steps` array and an optional `nth`.
 * @returns The steps joined with `.`, followed by `.nth(n)` when `nth` is set.
 */
function summarizeLocateResult(result) {
    // Render a value the way Playwright source would show it: /regex/ or 'literal'.
    const renderValue = (value, isRegex) => (isRegex ? `/${value}/` : `'${value}'`);
    // getByText / getByLabel / getByPlaceholder share the same shape.
    const renderTextStep = (step) => {
        // A missing text is shown as the '?' placeholder, never as a regex.
        const asRegex = step.textIsRegex === true && step.text != null;
        const exactOption = step.exact === true ? ', { exact: true }' : '';
        return `${step.method}(${renderValue(step.text ?? '?', asRegex)}${exactOption})`;
    };
    const renderStep = (step) => {
        switch (step.method) {
            case 'getByRole': {
                const roleOptions = [];
                if (step.name) {
                    roleOptions.push(`name: ${renderValue(step.name, step.nameIsRegex === true)}`);
                    if (step.exact === true) {
                        roleOptions.push('exact: true');
                    }
                }
                const optionText = roleOptions.length > 0 ? `, { ${roleOptions.join(', ')} }` : '';
                return `getByRole('${step.role ?? '?'}'${optionText})`;
            }
            case 'getByText':
            case 'getByLabel':
            case 'getByPlaceholder':
                return renderTextStep(step);
            case 'getByTestId':
                return `getByTestId('${step.testId ?? '?'}')`;
            case 'locator':
                return `locator('${step.selector ?? '?'}')`;
            default:
                return step.method;
        }
    };
    const chain = result.steps.map(renderStep).join('.');
    // `!= null` treats a null `nth` as "not set", the same as undefined.
    const nthSuffix = result.nth != null ? `.nth(${result.nth})` : '';
    return `${chain}${nthSuffix}`;
}
/**
 * Count the elements a locator matches, treating any failure as zero matches.
 *
 * This stays best-effort so a failing `count()` simply sends the caller down
 * its "no matches" path, but the failure is logged at debug level instead of
 * being swallowed silently, so a broken locator can be told apart from a
 * genuine zero-match result when reading logs.
 *
 * @param locator Object exposing an async `count()` method.
 * @returns The match count, or 0 when counting throws.
 */
async function safeCount(locator) {
    try {
        return await locator.count();
    }
    catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        Logger_1.appLogger.debug(`locate: counting matches failed, treating as 0 matches (${detail})`);
        return 0;
    }
}
//# sourceMappingURL=locateElement.js.map