@hyperbrowser/agent
Version:
Hyperbrowsers Web Agent
109 lines (108 loc) • 4.17 kB
JavaScript
;
/**
* ExamineDom - Find elements in accessibility tree based on natural language
*
* Takes a natural language instruction (e.g., "click the login button") and returns
* matching elements from the accessibility tree with confidence scores.
*/
// Flag the exports object with `__esModule` (the marker transpilers emit so
// importers can tell this CommonJS module was authored with ES-module syntax).
Object.defineProperty(exports, "__esModule", { value: true });
// Public API. Both names are function declarations later in this file, so they
// are hoisted and already defined at this point.
exports.examineDom = examineDom;
exports.extractValueFromInstruction = extractValueFromInstruction;
// Prompt builders used by examineDom for the system and user messages.
const prompts_1 = require("./prompts");
// Provides ExamineDomResultsSchema, passed to the structured LLM call in examineDom.
const schema_1 = require("./schema");
/**
 * Find elements in the accessibility tree that match the given instruction.
 *
 * The instruction and the serialized tree are sent to the LLM with a structured
 * output schema. The candidates it returns are ranked by confidence (highest
 * first) and any candidate whose `elementId` is unknown to the supplied context
 * is dropped with a warning.
 *
 * @param instruction - Natural language instruction (e.g., "click the login button")
 * @param context - Current page context. `context.tree` is sent to the LLM;
 *   `context.elements` (anything with a `has(id)` method, e.g. a Map) and
 *   `context.xpathMap` (plain object keyed by element id) are used to validate
 *   the returned ids. Either lookup may be absent.
 * @param llm - LLM client exposing `invokeStructured(request, messages)` that
 *   resolves to `{ rawText, parsed }`
 * @returns `{ elements, llmResponse }`. `elements` is a new array (the LLM's
 *   parsed payload is not reordered or otherwise mutated). On any error during
 *   the LLM call or post-processing, the error is logged and
 *   `{ elements: [], llmResponse: { rawText: "", parsed: null } }` is returned.
 *
 * @example
 * ```typescript
 * const { elements, llmResponse } = await examineDom(
 *   "click the login button",
 *   {
 *     tree: "[0-1234] button: Login\n[0-5678] button: Sign Up",
 *     xpathMap: { "0-1234": "/html/body/button[1]" },
 *     elements: new Map(),
 *     url: "https://example.com"
 *   },
 *   llmClient
 * );
 *
 * // Returns: { elements: [...], llmResponse: { rawText: "...", parsed: {...} } }
 * ```
 */
async function examineDom(instruction, context, llm) {
    // Build prompts for element finding
    const systemPrompt = (0, prompts_1.buildExamineDomSystemPrompt)();
    const userPrompt = (0, prompts_1.buildExamineDomUserPrompt)(instruction, context.tree);
    try {
        // Call LLM with structured output to find elements
        const response = await llm.invokeStructured({
            schema: schema_1.ExamineDomResultsSchema,
            options: {
                temperature: 0, // Deterministic for element finding
            },
        }, [
            { role: "system", content: systemPrompt },
            { role: "user", content: userPrompt },
        ]);
        const llmResponse = {
            rawText: response.rawText,
            parsed: response.parsed,
        };
        const candidates = response.parsed ? response.parsed.elements : undefined;
        if (!Array.isArray(candidates)) {
            // No elements found or parsing failed; still surface the raw LLM response
            return { elements: [], llmResponse };
        }
        // Sort a copy: Array.prototype.sort works in place, and the same array is
        // handed back to the caller through llmResponse.parsed.
        const ranked = [...candidates].sort((a, b) => b.confidence - a.confidence);
        const knownElements = context.elements;
        const xpathMap = context.xpathMap;
        const isInElements = (id) => knownElements != null &&
            typeof knownElements.has === "function" &&
            knownElements.has(id);
        // Own-property check so ids such as "constructor" or "toString" do not
        // resolve to members inherited from Object.prototype.
        const isInXpathMap = (id) => xpathMap != null &&
            Object.prototype.hasOwnProperty.call(xpathMap, id) &&
            xpathMap[id] !== undefined;
        // Keep only candidates whose elementId exists in the context
        const validatedResults = ranked.filter((result) => {
            if (isInElements(result.elementId) || isInXpathMap(result.elementId)) {
                return true;
            }
            console.warn(`[examineDom] Element ${result.elementId} not found in context, skipping`);
            return false;
        });
        return { elements: validatedResults, llmResponse };
    }
    catch (error) {
        console.error("[examineDom] Error finding elements:", error);
        // Return empty result on error (graceful degradation)
        return {
            elements: [],
            llmResponse: {
                rawText: "",
                parsed: null,
            },
        };
    }
}
/**
 * Extract text value from instruction for fill actions
 *
 * Extracts the value to be filled from instructions like:
 * - "fill email with test@example.com" → "test@example.com"
 * - "type hello into search box" → "hello"
 * - "enter password123 in password field" → "password123"
 * - "fill in john@example.com" → "john@example.com"
 *
 * Matching is tried in this order:
 * 1. "<type|enter|input|write> VALUE into TARGET" → VALUE
 * 2. "<type|enter|input|write> VALUE in TARGET" → VALUE
 * 3. "... with VALUE" → VALUE
 * 4. "... into VALUE" / "... in VALUE" → VALUE (fallback for phrasings such as
 *    "fill in VALUE" where the value trails the preposition)
 *
 * Prepositions are matched as whole words only, so "click the login button"
 * yields no value rather than matching the "in" inside "login".
 *
 * @param instruction - The natural language instruction
 * @returns The extracted value (trimmed) or empty string if no value found or
 *   the instruction is not a string
 */
function extractValueFromInstruction(instruction) {
    if (typeof instruction !== "string") {
        return "";
    }
    const patterns = [
        // Value sits between the verb and the target: "type X into Y", "enter X in Y".
        // "into" is tried before "in" so values containing " in " survive when
        // the target is introduced with "into".
        /^\s*(?:type|enter|input|write)\s+(.+?)\s+into\s+\S/i,
        /^\s*(?:type|enter|input|write)\s+(.+?)\s+in\s+\S/i,
        // Value trails the preposition: "fill Y with X", then the fallbacks.
        /\bwith\s+(.+)$/i,
        /\binto\s+(.+)$/i,
        /\bin\s+(.+)$/i,
    ];
    for (const pattern of patterns) {
        const match = instruction.match(pattern);
        if (match) {
            return match[1].trim();
        }
    }
    return "";
}