browser-use-typescript
Version:
A TypeScript-based browser automation framework
618 lines • 32.8 kB
JavaScript
import { PromptTemplate } from "@langchain/core/prompts";
import { ClickElementActionSchema, DoneActionSchema, GoToUrlActionSchema, InputTextActionSchema, NoParamsActionSchema, OpenTabActionSchema, ScrollActionSchema, SendKeysActionSchema, SwitchTabActionSchema, ExtractPageContentActionSchema, SearchGoogleActionSchema, ClickElementBySelectorActionSchema, ClickElementByXpathActionSchema, ClickElementByTextActionSchema, ScrolltoTextActionSchema } from "./types";
import { Registry } from "./registry/service";
import { ActionResult } from "../agent/types";
import { logger } from "../agent/agentImplement/agentClass";
import TurndownService from "turndown";
/**
 * Holder for a list of info entries. The shape of the entries is not
 * visible in this file; each instance starts with its own empty list.
 */
export class PageInfo {
    constructor() {
        // A fresh array per instance so that instances never share state.
        this.info = [];
    }
}
export class Controller {
registry;
pageInfo;
constructor() {
this.registry = new Registry();
this.pageInfo = new PageInfo();
/**
 * Create a new Controller and register all default browser actions
 * on its registry.
 * NOTE(review): earlier documentation listed `excludeActions` (action names
 * to exclude from registration) and `outputModel` (optional output model for
 * the done action) parameters, but this constructor takes no arguments.
 */
// 'done': terminal action. Validates the parameters and returns a result
// flagged as done, carrying the reported success flag and text.
this.registry.registerAction('done', 'Complete task - with return text and if the task is finished (success=True) or not yet completely finished (success=False), because last step is reached', DoneActionSchema, async function done(params) {
    const { success, text } = DoneActionSchema.parse(params);
    return new ActionResult({
        isDone: true,
        success,
        extractedContent: text
    });
});
// Basic Navigation Actions
// 'search': open a Google results page for the query in the current tab.
// If the page shows text hinting at a captcha, a fixed-position button is
// injected that redirects the tab to the same search on Bing when clicked.
// Returns an error result when no browser context is supplied.
this.registry.registerAction('search', 'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items.', SearchGoogleActionSchema, async function search(params, browserContext) {
    logger.debug(params);
    // Validate parameters with the Zod schema before using them.
    const { query } = SearchGoogleActionSchema.parse(params);
    const browser = browserContext;
    if (!browser) {
        return new ActionResult({
            error: "Browser context is missing",
            success: false
        });
    }
    // Percent-encode the query so characters such as '&', '#', '+' or '%'
    // cannot truncate it or inject extra URL parameters.
    const encodedQuery = encodeURIComponent(query);
    const page = await browser.get_current_page();
    await page.goto(`https://www.google.com/search?q=${encodedQuery}&udm=14`);
    await page.waitForLoadState();
    // Await each lookup on its own: `lookupA() || lookupB()` on un-awaited
    // promises always yields the first promise, so the second text would
    // never be tried.
    const isNotRobot = (await browser.get_locate_element_by_text('not a robot')) ||
        (await browser.get_locate_element_by_text('a human'));
    if (await isNotRobot?.isVisible()) {
        logger.debug("Potential captcha detected, adding visual indicator button");
        // Runs inside the page: add the button once (guarded by its id).
        await page.evaluate((encoded) => {
            if (document.getElementById("captcha-bypass-btn")) {
                return;
            }
            const button = document.createElement('button');
            button.id = "captcha-bypass-btn";
            button.style.cssText = `
                position: fixed;
                top: 20px;
                right: 20px;
                z-index: 9999;
                padding: 10px 20px;
                background-color: #4CAF50;
                color: white;
                border: none;
                border-radius: 4px;
                cursor: pointer;
                font-weight: bold;
            `;
            button.textContent = 'If you see captcha, click here';
            // Clicking the button redirects the tab to the Bing search.
            button.onclick = function () {
                window.location.href = "https://www.bing.com/search?q=" + encoded;
            };
            document.body.appendChild(button);
        }, encodedQuery);
        const msg = `๐ Searched for "${query}" in Google (captcha check button added)`;
        return new ActionResult({
            extractedContent: msg,
            includeInMemory: true
        });
    }
    const msg = `๐ Searched for "${query}" in Google`;
    logger.debug(msg);
    return new ActionResult({
        extractedContent: msg,
        includeInMemory: true
    });
});
// 'goToUrl': load the validated URL in the current page and wait for the
// page's load state before reporting.
this.registry.registerAction('goToUrl', 'Navigate to URL in the current tab', GoToUrlActionSchema, async function goToUrl(params, browser) {
    const { url: targetUrl } = GoToUrlActionSchema.parse(params);
    const currentPage = await browser.get_current_page();
    await currentPage.goto(targetUrl);
    await currentPage.waitForLoadState();
    const summary = `๐ Navigated to ${targetUrl}`;
    logger.debug(summary);
    return new ActionResult({ extractedContent: summary, includeInMemory: true });
});
// 'goBack': delegate to the browser context's go_back() and report it.
this.registry.registerAction('goBack', 'Go back', NoParamsActionSchema, async function goBack(_, browser) {
    await browser.go_back();
    const summary = '๐ Navigated back';
    logger.debug(summary);
    return new ActionResult({ extractedContent: summary, includeInMemory: true });
});
// Wait for x seconds
// 'wait': pause for a fixed 3 seconds. The action is registered with the
// no-params schema, so the incoming parameter object is ignored.
this.registry.registerAction('wait', 'Wait for x seconds default 3', NoParamsActionSchema, async function wait(_) {
    const waitSeconds = 3;
    // The params object used to be concatenated onto this message, which
    // produced text such as "...seconds[object Object]".
    const msg = `๐ Waiting for ${waitSeconds} seconds`;
    logger.debug(msg);
    await new Promise((resolve) => setTimeout(resolve, waitSeconds * 1000));
    return new ActionResult({
        extractedContent: msg,
        includeInMemory: true
    });
});
// Element Interaction Actions
// 'clickElement': click the element registered under the given index.
// - Unknown index: returns a result explaining that the index does not exist.
// - Element that opens a file-upload dialog: refuses and says so.
// - Successful click: reports either the download path returned by the click
//   helper or the clicked element's text, and switches to a newly opened tab
//   when the number of open pages grew.
// - Failed click: returns an error result enriched with element diagnostics.
this.registry.registerAction('clickElement', 'Click element', ClickElementActionSchema, async function clickElement(params, browser) {
    const { index } = ClickElementActionSchema.parse(params);
    const session = await browser.get_session();
    const selectorMap = await browser.get_selector_map();
    if (!(index in selectorMap)) {
        return new ActionResult({
            extractedContent: `Element with index ${index} does not exist - retry or use alternative actions`,
            includeInMemory: true
        });
    }
    const elementNode = await browser.get_dom_element_by_index(index);
    // Count open pages whether `context.pages` is an array or a method.
    // Playwright's BrowserContext exposes `pages()` as a method; reading
    // `.length` from the method itself gives its arity (0), which would make
    // the new-tab comparison below always false.
    // NOTE(review): assumes session.context is a Playwright BrowserContext or
    // a wrapper exposing `pages` - confirm against get_session().
    const countOpenPages = () => {
        const { context } = session;
        return typeof context.pages === 'function' ? context.pages().length : context.pages.length;
    };
    const initialPages = countOpenPages();
    // Elements that open a file chooser are not clicked by this action.
    if (await browser.is_file_uploader(elementNode)) {
        const msg = `Index ${index} - has an element which opens file upload dialog. To upload files please use a specific function to upload files`;
        logger.debug(msg);
        return new ActionResult({
            extractedContent: msg,
            includeInMemory: true
        });
    }
    // Best-effort description of the element's state for failure reports.
    // Never throws: diagnostic failures are folded into the returned text.
    const describeElementState = async () => {
        try {
            const elementHandle = await browser.get_element_by_index(index);
            if (!elementHandle) {
                return 'Element handle could not be retrieved';
            }
            const isVisible = await elementHandle.isVisible();
            const isEnabled = await elementHandle.isEnabled();
            const boundingBox = await elementHandle.boundingBox();
            let isInViewport = false;
            if (boundingBox) {
                const page = await browser.get_current_page();
                const viewportSize = page.viewportSize();
                if (viewportSize) {
                    // Fully inside the viewport rectangle on both axes.
                    isInViewport = (boundingBox.x >= 0 &&
                        boundingBox.y >= 0 &&
                        boundingBox.x + boundingBox.width <= viewportSize.width &&
                        boundingBox.y + boundingBox.height <= viewportSize.height);
                }
            }
            let details = `Element properties: visible=${isVisible}, enabled=${isEnabled}, inViewport=${isInViewport}`;
            if (boundingBox) {
                details += `, position=[x:${Math.round(boundingBox.x)},y:${Math.round(boundingBox.y)},w:${Math.round(boundingBox.width)},h:${Math.round(boundingBox.height)}]`;
            }
            return details;
        }
        catch (diagError) {
            return `Could not diagnose element state: ${String(diagError)}`;
        }
    };
    try {
        const downloadPath = await browser._click_element_node(elementNode);
        let msg = downloadPath
            ? `๐พ Downloaded file to ${downloadPath}`
            : `๐ฑ๏ธ Clicked button with index ${index}: ${elementNode.getAllTextTillNextClickableElement(2)}`;
        logger.debug(msg);
        logger.debug(`Element xpath: ${elementNode.xpath}`);
        if (countOpenPages() > initialPages) {
            const newTabMsg = 'New tab opened - switching to it';
            msg += ` - ${newTabMsg}`;
            logger.debug(newTabMsg);
            // -1 is passed through to switch_to_tab unchanged, as before.
            await browser.switch_to_tab(-1);
        }
        return new ActionResult({
            extractedContent: msg,
            includeInMemory: true
        });
    }
    catch (e) {
        const errorDetails = await describeElementState();
        logger.debug(`Element not clickable with index ${index} - ${errorDetails}`);
        logger.debug(`Click error details: ${String(e)}`);
        return new ActionResult({
            error: `Failed to click element with index ${index}: ${String(e)}\n${errorDetails}`
        });
    }
});
// 'inputText': type the given text into the element registered under the
// given index. Throws when the index is not in the selector map. When
// hasSensitiveData is set, the typed text is left out of the report.
this.registry.registerAction('inputText', 'Input text into a input interactive element', InputTextActionSchema, async function inputText(params, browser, hasSensitiveData = false) {
    const { index, text } = InputTextActionSchema.parse(params);
    const knownElements = await browser.get_selector_map();
    const isKnownIndex = index in knownElements;
    if (!isKnownIndex) {
        throw new Error(`Element index ${index} does not exist - retry or use alternative actions`);
    }
    const target = await browser.get_dom_element_by_index(index);
    await browser._input_text_element_node(target, text);
    const summary = hasSensitiveData
        ? `โจ๏ธ Input sensitive data into index ${index}`
        : `โจ๏ธ Input ${text} into index ${index}`;
    logger.debug(summary);
    logger.debug(`Element xpath: ${target.xpath}`);
    return new ActionResult({ extractedContent: summary, includeInMemory: true });
});
// Save PDF
// 'savePdf': write the current page to "<slug>.pdf", where the slug is
// derived from the page URL (scheme, leading "www." and trailing slash
// removed; other non-alphanumeric runs collapsed to '-'; lower-cased).
this.registry.registerAction('savePdf', 'Save the current page as a PDF file', NoParamsActionSchema, async function savePdf(_, browser) {
    const currentPage = await browser.get_current_page();
    const withoutSchemeOrSlash = currentPage.url().replace(/^https?:\/\/(?:www\.)?|\/$/g, '');
    const dashed = withoutSchemeOrSlash.replace(/[^a-zA-Z0-9]+/g, '-');
    const trimmed = dashed.replace(/^-|-$/g, '');
    const pdfName = `${trimmed.toLowerCase()}.pdf`;
    // Switch to screen media before generating the PDF.
    await currentPage.emulateMedia({ media: 'screen' });
    await currentPage.pdf({ path: pdfName, format: 'A4', printBackground: false });
    const summary = `Saving page with URL ${currentPage.url()} as PDF to ./${pdfName}`;
    logger.debug(summary);
    return new ActionResult({ extractedContent: summary, includeInMemory: true });
});
// Tab Management Actions
// 'switchTab': switch to the tab identified by page_id, then wait for the
// now-current page's load state before reporting.
this.registry.registerAction('switchTab', 'Switch tab', SwitchTabActionSchema, async function switchTab(params, browser) {
    const { page_id: pageId } = SwitchTabActionSchema.parse(params);
    await browser.switch_to_tab(pageId);
    const activePage = await browser.get_current_page();
    await activePage.waitForLoadState();
    const summary = `๐ Switched to tab ${pageId}`;
    logger.debug(summary);
    return new ActionResult({ extractedContent: summary, includeInMemory: true });
});
// 'openTab': ask the browser context to create a new tab for the URL.
this.registry.registerAction('openTab', 'Open url in new tab', OpenTabActionSchema, async function openTab(params, browser) {
    const { url: targetUrl } = OpenTabActionSchema.parse(params);
    await browser.create_new_tab(targetUrl);
    const summary = `๐ Opened new tab with ${targetUrl}`;
    logger.debug(summary);
    return new ActionResult({ extractedContent: summary, includeInMemory: true });
});
// Content Actions
// 'extractContent': convert the current page's HTML to Markdown with
// Turndown, then ask the extraction LLM to pull out what the goal asks for.
// If formatting the prompt or invoking the LLM throws, the raw Markdown is
// returned instead (and that result is not flagged for memory).
this.registry.registerAction('extractContent', 'Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links', ExtractPageContentActionSchema, async function extractContent(params, browser, pageExtractionLlm) {
    const { value: goal } = ExtractPageContentActionSchema.parse(params);
    const currentPage = await browser.get_current_page();
    const markdownConverter = new TurndownService();
    const html = await currentPage.content();
    const content = markdownConverter.turndown(html);
    const template = new PromptTemplate({
        inputVariables: ['goal', 'page'],
        template: 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {goal}, Page: {page}'
    });
    // Logs the report and wraps it in a result with the given extra fields.
    const report = (body, extra) => {
        const summary = `๐ Extracted from page\n: ${body}\n`;
        logger.debug(summary);
        return new ActionResult({ extractedContent: summary, ...extra });
    };
    try {
        const formattedPrompt = await template.format({ goal, page: content });
        const output = await pageExtractionLlm.invoke(formattedPrompt);
        return report(output.content, { includeInMemory: true });
    }
    catch (e) {
        logger.debug(`Error extracting content: ${e}`);
        return report(content, {});
    }
});
// Scroll Actions
// 'scrollDown': scroll the window down by the given number of pixels, or by
// one viewport height when no amount is supplied (null or undefined).
this.registry.registerAction('scrollDown', 'Scroll down the page by pixel amount - if no amount is specified, scroll down one page', ScrollActionSchema, async function scrollDown(params, browser) {
    const { amount } = ScrollActionSchema.parse(params);
    const currentPage = await browser.get_current_page();
    const hasAmount = amount !== null && amount !== undefined;
    const script = hasAmount
        ? `window.scrollBy(0, ${amount});`
        : 'window.scrollBy(0, window.innerHeight);';
    await currentPage.evaluate(script);
    const distance = hasAmount ? `${amount} pixels` : 'one page';
    const summary = `๐ Scrolled down the page by ${distance}`;
    logger.debug(summary);
    return new ActionResult({ extractedContent: summary, includeInMemory: true });
});
// 'scrollUp': scroll the window up by the given number of pixels, or by one
// viewport height when no amount is supplied (null or undefined).
this.registry.registerAction('scrollUp', 'Scroll up the page by pixel amount - if no amount is specified, scroll up one page', ScrollActionSchema, async function scrollUp(params, browser) {
    const { amount } = ScrollActionSchema.parse(params);
    const page = await browser.get_current_page();
    const hasAmount = amount !== null && amount !== undefined;
    if (hasAmount) {
        // Pass the offset as an argument instead of splicing it into source
        // text: a negative amount used to produce `window.scrollBy(0, --5);`,
        // which is a syntax error in the page.
        await page.evaluate((deltaY) => window.scrollBy(0, deltaY), -amount);
    }
    else {
        await page.evaluate('window.scrollBy(0, -window.innerHeight);');
    }
    const distance = hasAmount ? `${amount} pixels` : 'one page';
    const msg = `๐ Scrolled up the page by ${distance}`;
    logger.debug(msg);
    return new ActionResult({
        extractedContent: msg,
        includeInMemory: true
    });
});
// Send Keys
// 'sendKeys': press the given key or shortcut via page.keyboard.press. If
// that fails with an 'Unknown key' error, the string is pressed one
// character at a time; any other error, or a failure on a single
// character, is rethrown.
this.registry.registerAction('sendKeys', 'Send strings of special keys like Escape,Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press.', SendKeysActionSchema, async function sendKeys(params, browser) {
    const { keys } = SendKeysActionSchema.parse(params);
    const currentPage = await browser.get_current_page();
    try {
        await currentPage.keyboard.press(keys);
    }
    catch (pressError) {
        const isUnknownKey = String(pressError).includes('Unknown key');
        if (!isUnknownKey) {
            throw pressError;
        }
        // Fall back to pressing each character of the string individually.
        for (const singleKey of keys) {
            try {
                await currentPage.keyboard.press(singleKey);
            }
            catch (singleKeyError) {
                logger.debug(`Error sending key ${singleKey}: ${String(singleKeyError)}`);
                throw singleKeyError;
            }
        }
    }
    const summary = `โจ๏ธ Sent keys: ${keys}`;
    logger.debug(summary);
    return new ActionResult({ extractedContent: summary, includeInMemory: true });
});
// 'clickElementBySelector': click the element located by a CSS selector.
// Registered with the selector schema so the advertised parameters match
// what the handler parses (it was previously registered with the
// index-based ClickElementActionSchema).
// Returns an error result when no element is located.
this.registry.registerAction('clickElementBySelector', 'Click element by selector', ClickElementBySelectorActionSchema, async function clickElementBySelector(params, browser) {
    const { css_selector: cssSelector } = ClickElementBySelectorActionSchema.parse(params);
    const elementNode = await browser.get_locate_element_by_css_selector(cssSelector);
    if (!elementNode) {
        const msg = `Element not clickable with selector ${cssSelector} - most likely the page changed`;
        logger.debug(msg);
        return new ActionResult({
            error: msg
        });
    }
    await elementNode.click();
    const msg = `๐ฑ๏ธ Clicked on element with selector ${cssSelector}`;
    logger.debug(msg);
    return new ActionResult({
        extractedContent: msg,
        includeInMemory: true
    });
});
// 'clickElementByXpath': click the element located by an XPath expression.
// Registered with the XPath schema so the advertised parameters match what
// the handler parses (it was previously registered with the index-based
// ClickElementActionSchema).
// Returns an error result when no element is located.
this.registry.registerAction('clickElementByXpath', 'Click on element by xpath', ClickElementByXpathActionSchema, async function clickElementByXpath(params, browser) {
    const { xpath } = ClickElementByXpathActionSchema.parse(params);
    const elementNode = await browser.get_locate_element_by_xpath(xpath);
    if (!elementNode) {
        const msg = `Element not clickable with xpath ${xpath} - most likely the page changed`;
        logger.debug(msg);
        return new ActionResult({
            error: msg
        });
    }
    await elementNode.click();
    const msg = `๐ฑ๏ธ Clicked on element with xpath ${xpath}`;
    logger.debug(msg);
    return new ActionResult({
        extractedContent: msg,
        includeInMemory: true
    });
});
// 'clickElementByText': click the element located by its text (and the
// optional nth value from the parameters). Returns an error result when
// nothing is located.
this.registry.registerAction('clickElementByText', 'Click element with text', ClickElementByTextActionSchema, async function clickElementByText(params, browser) {
    const { text, nth } = ClickElementByTextActionSchema.parse(params);
    const target = await browser.get_locate_element_by_text(text, nth);
    if (!target) {
        const failure = `No element found for text '${text}'`;
        logger.debug(failure);
        return new ActionResult({ error: failure });
    }
    await target.click();
    const summary = `๐ฑ๏ธ Clicked on element with text ${text}`;
    logger.debug(summary);
    return new ActionResult({ extractedContent: summary, includeInMemory: true });
});
// 'closeTab': close the browser context's current tab and report it.
this.registry.registerAction('closeTab', 'Close tab', NoParamsActionSchema, async function closeTab(_, browser) {
    await browser.close_current_tab();
    const summary = 'Closed tab';
    logger.debug(summary);
    return new ActionResult({ extractedContent: summary, includeInMemory: true });
});
// Scroll to Text
// 'scrollToText': bring the first occurrence of the given text into view.
// Strategies are tried in order, stopping at the first that succeeds:
//   1. the browser context's text locator,
//   2. an XPath `contains(text(), ...)` lookup,
//   3. scrolling down in steps of 80% of the viewport, retrying 1 and 2
//      after each step,
//   4. an in-page DOM traversal.
// Returns a success=false result when the text is never found, and an error
// result when any step throws.
this.registry.registerAction('scrollToText', 'If you dont find something which you want to interact with, scroll to it', ScrolltoTextActionSchema, async function scrollToText(params, browser) {
    const { text } = ScrolltoTextActionSchema.parse(params);
    const page = await browser.get_current_page();
    // Quote arbitrary text as an XPath string literal. XPath 1.0 has no
    // escape character, so text containing both quote kinds is assembled
    // with concat().
    const toXPathLiteral = (value) => {
        if (!value.includes("'")) {
            return `'${value}'`;
        }
        if (!value.includes('"')) {
            return `"${value}"`;
        }
        const pieces = value.split("'").map((piece) => `'${piece}'`);
        return `concat(${pieces.join(`, "'", `)})`;
    };
    // Log and build the success result; `via` names the strategy used.
    const reportFound = (via = '') => {
        const msg = `Scrolled to text: "${text}"${via}`;
        logger.debug(msg);
        return new ActionResult({
            extractedContent: msg,
            includeInMemory: true
        });
    };
    // Scroll a located element into view and give the scroll time to settle.
    const revealElement = async (element, via) => {
        await element.scrollIntoViewIfNeeded();
        await page.waitForTimeout(500);
        return reportFound(via);
    };
    try {
        logger.debug(`๐ Attempting to find and scroll to text: "${text}"`);
        const elementByText = await browser.get_locate_element_by_text(text);
        if (elementByText) {
            return await revealElement(elementByText);
        }
        const xpathText = `//*[contains(text(), ${toXPathLiteral(text)})]`;
        const elementByXPath = await browser.get_locate_element_by_xpath(xpathText);
        if (elementByXPath) {
            return await revealElement(elementByXPath, ' (via XPath)');
        }
        logger.debug(`Element with text "${text}" not found directly. Trying progressive scroll search...`);
        const dimensions = await page.evaluate("({ windowHeight: window.innerHeight, documentHeight: document.body.scrollHeight })");
        const scrollStep = Math.floor(dimensions.windowHeight * 0.8); // 80% of viewport
        // A zero (or NaN) step would make the iteration count infinite, so
        // progressive scrolling is skipped when the viewport has no height.
        const maxScrolls = scrollStep > 0 ? Math.ceil(dimensions.documentHeight / scrollStep) + 1 : 0;
        for (let i = 0; i < maxScrolls; i++) {
            await page.evaluate(`window.scrollBy(0, ${scrollStep});`);
            await page.waitForTimeout(300); // Wait for content to load
            const elementAfterScroll = await browser.get_locate_element_by_text(text);
            if (elementAfterScroll) {
                return await revealElement(elementAfterScroll, ' (after progressive scroll)');
            }
            const xpathElementAfterScroll = await browser.get_locate_element_by_xpath(xpathText);
            if (xpathElementAfterScroll) {
                return await revealElement(xpathElementAfterScroll, ' (via XPath after progressive scroll)');
            }
        }
        // Last resort: search the DOM inside the page. The text is passed as
        // an argument (not spliced into script source), so quotes,
        // backslashes or newlines in it cannot break or alter the script.
        const foundByEvaluation = await page.evaluate((needle) => {
            const containsNeedle = (node) => Boolean(node.textContent && node.textContent.includes(needle));
            for (const el of document.querySelectorAll('*')) {
                if (el.tagName === 'SCRIPT' || el.tagName === 'STYLE') {
                    continue;
                }
                // Take the innermost match: the root element's textContent
                // contains all page text, so the first plain match would
                // always be <html> itself.
                if (containsNeedle(el) && !Array.from(el.children).some(containsNeedle)) {
                    el.scrollIntoView({
                        behavior: 'smooth',
                        block: 'center'
                    });
                    return true;
                }
            }
            return false;
        }, text);
        if (foundByEvaluation) {
            await page.waitForTimeout(500);
            return reportFound(' (via DOM traversal)');
        }
        const notFoundMsg = `Text "${text}" not found on page after extensive search`;
        logger.debug(notFoundMsg);
        return new ActionResult({
            extractedContent: notFoundMsg,
            includeInMemory: true,
            success: false
        });
    }
    catch (error) {
        const errorMsg = `Error scrolling to text "${text}": ${String(error)}`;
        logger.debug(errorMsg);
        return new ActionResult({
            error: errorMsg,
            includeInMemory: true
        });
    }
});
}
/**
* Decorator for registering custom actions
* @param description Describe what the function does
*/
/**
* Execute an action
* @param action The action to execute
* @param browserContext The browser context
* @param pageExtractionLlm Optional LLM for page extraction
* @param sensitiveData Optional sensitive data
* @param availableFilePaths Optional available file paths
* @param context Optional context
*/
async act(action, browserContext, pageExtractionLlm, sensitiveData, availableFilePaths, context) {
try {
// Get the action name and parameters from the ActionModel
if (!action.actions || Object.keys(action.actions).length === 0) {
return new ActionResult({
error: "No actions defined in ActionModel",
success: false
});
}
// Get the first action name (there should only be one)
const actionName = Object.keys(action.actions)[0];
// Get the parameters either from paramToJson() or directly from params
const params = action.params || {};
logger.success(`๐ฎExecuting action: ${actionName} โ๏ธ${JSON.stringify(params)}`);
// Create a context object with all required parameters
const contextObj = {
browser: browserContext,
pageExtractionLlm,
sensitiveData,
availableFilePaths,
userContext: context
};
// Pass the parameters as needed by the registry's executeAction method
const result = await this.registry.executeAction(actionName, params, contextObj);
if (result.error) {
if (result.error.length > 0) {
logger.debuggerError(`๐ฎProblem Executing ${actionName}: ${result.error}`);
}
}
return result;
}
catch (e) {
logger.debug(`Error in Controller.act: ${e.message}`);
return new ActionResult({
error: `Error executing action: ${e.message}`,
success: false
});
}
}
}
//# sourceMappingURL=controllerContext.js.map