UNPKG

browser-use-typescript

Version:

A TypeScript-based browser automation framework

618 lines • 32.8 kB
import { PromptTemplate } from "@langchain/core/prompts";
import {
    ClickElementActionSchema,
    DoneActionSchema,
    GoToUrlActionSchema,
    InputTextActionSchema,
    NoParamsActionSchema,
    OpenTabActionSchema,
    ScrollActionSchema,
    SendKeysActionSchema,
    SwitchTabActionSchema,
    ExtractPageContentActionSchema,
    SearchGoogleActionSchema,
    ClickElementBySelectorActionSchema,
    ClickElementByXpathActionSchema,
    ClickElementByTextActionSchema,
    ScrolltoTextActionSchema
} from "./types";
import { Registry } from "./registry/service";
import { ActionResult } from "../agent/types";
import { logger } from "../agent/agentImplement/agentClass";
import TurndownService from "turndown";

/** Number of seconds the `wait` action pauses for. */
const WAIT_SECONDS = 3;

/**
 * Quote an arbitrary string as an XPath 1.0 string literal.
 *
 * XPath 1.0 has no escape character, so a value containing both quote kinds
 * has to be assembled with `concat()`.
 *
 * @param {string} value Raw text to embed in an XPath expression.
 * @returns {string} An XPath expression that evaluates to `value`.
 */
function toXPathLiteral(value) {
    if (!value.includes("'")) {
        return `'${value}'`;
    }
    if (!value.includes('"')) {
        return `"${value}"`;
    }
    const pieces = value.split("'").map((piece) => `'${piece}'`);
    return `concat(${pieces.join(`, "'", `)})`;
}

/**
 * Container with a single `info` array, initially empty.
 */
export class PageInfo {
    info = [];
}

/**
 * Registers the default browser actions on a `Registry` and executes them on
 * request through {@link Controller#act}.
 */
export class Controller {
    registry;
    pageInfo;

    /**
     * Create a controller with a fresh `Registry` and `PageInfo`, then register
     * every default action. Takes no arguments.
     */
    constructor() {
        this.registry = new Registry();
        this.pageInfo = new PageInfo();

        // ---- Task completion ------------------------------------------------
        this.registry.registerAction('done', 'Complete task - with return text and if the task is finished (success=True) or not yet completely finished (success=False), because last step is reached', DoneActionSchema, async function done(params) {
            const doneActionSchema = DoneActionSchema.parse(params);
            return new ActionResult({
                isDone: true,
                success: doneActionSchema.success,
                extractedContent: doneActionSchema.text
            });
        });

        // ---- Basic navigation -----------------------------------------------
        this.registry.registerAction('search', 'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items.', SearchGoogleActionSchema, async function search(params, browserContext) {
            logger.debug(params);
            const validatedParams = SearchGoogleActionSchema.parse(params);
            const query = validatedParams.query;
            const browser = browserContext;
            if (!browser) {
                return new ActionResult({
                    error: "Browser context is missing",
                    success: false
                });
            }
            const page = await browser.get_current_page();
            // Encode the query so characters such as `&`, `#` or `+` stay part of
            // the search term instead of altering the URL.
            await page.goto(`https://www.google.com/search?q=${encodeURIComponent(query)}&udm=14`);
            await page.waitForLoadState();
            // Look for captcha wording. Each lookup is awaited on its own: a
            // pending Promise is always truthy, so `await (a() || b())` would
            // never try the second phrase.
            const captchaHint = (await browser.get_locate_element_by_text('not a robot'))
                || (await browser.get_locate_element_by_text('a human'));
            if (await captchaHint?.isVisible()) {
                console.log("Potential captcha detected, adding visual indicator button");
                // Inject a button the agent can click to retry the same query on Bing.
                await page.evaluate((query) => {
                    if (!document.getElementById("captcha-bypass-btn")) {
                        const button = document.createElement('button');
                        button.id = "captcha-bypass-btn";
                        button.style.cssText = `
                            position: fixed;
                            top: 20px;
                            right: 20px;
                            z-index: 9999;
                            padding: 10px 20px;
                            background-color: #4CAF50;
                            color: white;
                            border: none;
                            border-radius: 4px;
                            cursor: pointer;
                            font-weight: bold;
                        `;
                        button.textContent = 'If you see captcha, click here';
                        button.onclick = function () {
                            window.location.href = `https://www.bing.com/search?q=${encodeURIComponent(query)}`;
                        };
                        document.body.appendChild(button);
                    }
                }, query);
                const msg = `๐Ÿ” Searched for "${query}" in Google (captcha check button added)`;
                return new ActionResult({
                    extractedContent: msg,
                    includeInMemory: true
                });
            }
            const msg = `๐Ÿ” Searched for "${query}" in Google`;
            logger.debug(msg);
            return new ActionResult({
                extractedContent: msg,
                includeInMemory: true
            });
        });

        this.registry.registerAction('goToUrl', 'Navigate to URL in the current tab', GoToUrlActionSchema, async function goToUrl(params, browser) {
            const url = GoToUrlActionSchema.parse(params);
            const page = await browser.get_current_page();
            await page.goto(url.url);
            await page.waitForLoadState();
            const msg = `๐Ÿ”— Navigated to ${url.url}`;
            logger.debug(msg);
            return new ActionResult({
                extractedContent: msg,
                includeInMemory: true
            });
        });

        this.registry.registerAction('goBack', 'Go back', NoParamsActionSchema, async function goBack(_, browser) {
            await browser.go_back();
            const msg = '๐Ÿ”™ Navigated back';
            logger.debug(msg);
            return new ActionResult({
                extractedContent: msg,
                includeInMemory: true
            });
        });

        // Fixed pause; the action is registered with the no-params schema.
        this.registry.registerAction('wait', 'Wait for x seconds default 3', NoParamsActionSchema, async function wait(_) {
            // The params object is deliberately not appended to the message
            // (doing so rendered as "[object Object]").
            const msg = `๐Ÿ•’ Waiting for ${WAIT_SECONDS} seconds`;
            logger.debug(msg);
            await new Promise(resolve => setTimeout(resolve, WAIT_SECONDS * 1000));
            return new ActionResult({
                extractedContent: msg,
                includeInMemory: true
            });
        });

        // ---- Element interaction --------------------------------------------
        this.registry.registerAction('clickElement', 'Click element', ClickElementActionSchema, async function clickElement(params, browser) {
            const clickElementActionSchema = ClickElementActionSchema.parse(params);
            const session = await browser.get_session();
            const selectorMap = await browser.get_selector_map();
            if (!(clickElementActionSchema.index in selectorMap)) {
                return new ActionResult({
                    extractedContent: `Element with index ${clickElementActionSchema.index} does not exist - retry or use alternative actions`,
                    includeInMemory: true
                });
            }
            const elementNode = await browser.get_dom_element_by_index(clickElementActionSchema.index);
            // NOTE(review): this reads `pages` as a property. If `session.context`
            // is a raw Playwright BrowserContext, `pages` is a method and `.length`
            // would be its arity - confirm the session wrapper exposes an array.
            const initialPages = session.context.pages.length;
            // Elements that open a file chooser are not clicked.
            if (await browser.is_file_uploader(elementNode)) {
                const msg = `Index ${clickElementActionSchema.index} - has an element which opens file upload dialog. To upload files please use a specific function to upload files`;
                logger.debug(msg);
                return new ActionResult({
                    extractedContent: msg,
                    includeInMemory: true
                });
            }
            let msg = null;
            try {
                const downloadPath = await browser._click_element_node(elementNode);
                if (downloadPath) {
                    msg = `๐Ÿ’พ Downloaded file to ${downloadPath}`;
                }
                else {
                    msg = `๐Ÿ–ฑ๏ธ Clicked button with index ${clickElementActionSchema.index}: ${elementNode.getAllTextTillNextClickableElement(2)}`;
                }
                logger.debug(msg);
                logger.debug(`Element xpath: ${elementNode.xpath}`);
                // A larger page count after the click means a new tab was opened.
                if (session.context.pages.length > initialPages) {
                    const newTabMsg = 'New tab opened - switching to it';
                    msg += ` - ${newTabMsg}`;
                    logger.debug(newTabMsg);
                    await browser.switch_to_tab(-1);
                }
                return new ActionResult({
                    extractedContent: msg,
                    includeInMemory: true
                });
            }
            catch (e) {
                // Collect element state so the returned error explains why the
                // click failed.
                let errorDetails = '';
                try {
                    const elementHandle = await browser.get_element_by_index(clickElementActionSchema.index);
                    if (elementHandle) {
                        const isVisible = await elementHandle.isVisible();
                        const isEnabled = await elementHandle.isEnabled();
                        const boundingBox = await elementHandle.boundingBox();
                        let isInViewport = false;
                        if (boundingBox) {
                            const page = await browser.get_current_page();
                            const viewportSize = page.viewportSize();
                            if (viewportSize) {
                                isInViewport = (boundingBox.x >= 0 &&
                                    boundingBox.y >= 0 &&
                                    boundingBox.x + boundingBox.width <= viewportSize.width &&
                                    boundingBox.y + boundingBox.height <= viewportSize.height);
                            }
                        }
                        errorDetails = `Element properties: visible=${isVisible}, enabled=${isEnabled}, inViewport=${isInViewport}`;
                        if (boundingBox) {
                            errorDetails += `, position=[x:${Math.round(boundingBox.x)},y:${Math.round(boundingBox.y)},w:${Math.round(boundingBox.width)},h:${Math.round(boundingBox.height)}]`;
                        }
                    }
                    else {
                        errorDetails = 'Element handle could not be retrieved';
                    }
                }
                catch (diagError) {
                    errorDetails = `Could not diagnose element state: ${String(diagError)}`;
                }
                logger.debug(`Element not clickable with index ${clickElementActionSchema.index} - ${errorDetails}`);
                logger.debug(`Click error details: ${String(e)}`);
                return new ActionResult({
                    error: `Failed to click element with index ${clickElementActionSchema.index}: ${String(e)}\n${errorDetails}`
                });
            }
        });

        this.registry.registerAction('inputText', 'Input text into a input interactive element', InputTextActionSchema, async function inputText(params, browser, hasSensitiveData = false) {
            const inputTextActionSchema = InputTextActionSchema.parse(params);
            const selectorMap = await browser.get_selector_map();
            if (!(inputTextActionSchema.index in selectorMap)) {
                throw new Error(`Element index ${inputTextActionSchema.index} does not exist - retry or use alternative actions`);
            }
            const elementNode = await browser.get_dom_element_by_index(inputTextActionSchema.index);
            await browser._input_text_element_node(elementNode, inputTextActionSchema.text);
            let msg;
            if (!hasSensitiveData) {
                msg = `โŒจ๏ธ Input ${inputTextActionSchema.text} into index ${inputTextActionSchema.index}`;
            }
            else {
                // Keep the typed value out of logs and memory.
                msg = `โŒจ๏ธ Input sensitive data into index ${inputTextActionSchema.index}`;
            }
            logger.debug(msg);
            logger.debug(`Element xpath: ${elementNode.xpath}`);
            return new ActionResult({
                extractedContent: msg,
                includeInMemory: true
            });
        });

        // ---- Save PDF -------------------------------------------------------
        this.registry.registerAction('savePdf', 'Save the current page as a PDF file', NoParamsActionSchema, async function savePdf(_, browser) {
            const page = await browser.get_current_page();
            // Derive a filename from the URL: strip scheme, "www." and a trailing
            // slash, then collapse every non-alphanumeric run into "-".
            const shortUrl = page.url().replace(/^https?:\/\/(?:www\.)?|\/$/g, '');
            const slug = shortUrl.replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-|-$/g, '').toLowerCase();
            const sanitizedFilename = `${slug}.pdf`;
            await page.emulateMedia({ media: 'screen' });
            await page.pdf({ path: sanitizedFilename, format: 'A4', printBackground: false });
            const msg = `Saving page with URL ${page.url()} as PDF to ./${sanitizedFilename}`;
            logger.debug(msg);
            return new ActionResult({
                extractedContent: msg,
                includeInMemory: true
            });
        });

        // ---- Tab management -------------------------------------------------
        this.registry.registerAction('switchTab', 'Switch tab', SwitchTabActionSchema, async function switchTab(params, browser) {
            const switchTabActionSchema = SwitchTabActionSchema.parse(params);
            await browser.switch_to_tab(switchTabActionSchema.page_id);
            // Wait for the newly selected tab to finish loading.
            const page = await browser.get_current_page();
            await page.waitForLoadState();
            const msg = `๐Ÿ”„ Switched to tab ${switchTabActionSchema.page_id}`;
            logger.debug(msg);
            return new ActionResult({
                extractedContent: msg,
                includeInMemory: true
            });
        });

        this.registry.registerAction('openTab', 'Open url in new tab', OpenTabActionSchema, async function openTab(params, browser) {
            const openTabActionSchema = OpenTabActionSchema.parse(params);
            await browser.create_new_tab(openTabActionSchema.url);
            const msg = `๐Ÿ”— Opened new tab with ${openTabActionSchema.url}`;
            logger.debug(msg);
            return new ActionResult({
                extractedContent: msg,
                includeInMemory: true
            });
        });

        // ---- Content extraction ---------------------------------------------
        this.registry.registerAction('extractContent', 'Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links', ExtractPageContentActionSchema, async function extractContent(params, browser, pageExtractionLlm) {
            const extractParams = ExtractPageContentActionSchema.parse(params);
            const goal = extractParams.value;
            const page = await browser.get_current_page();
            // Convert the page HTML to Markdown before handing it to the LLM.
            const turndownService = new TurndownService();
            const content = turndownService.turndown(await page.content());
            const prompt = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {goal}, Page: {page}';
            const template = new PromptTemplate({
                inputVariables: ['goal', 'page'],
                template: prompt
            });
            try {
                const formattedPrompt = await template.format({ goal, page: content });
                const output = await pageExtractionLlm.invoke(formattedPrompt);
                const msg = `๐Ÿ“„ Extracted from page\n: ${output.content}\n`;
                logger.debug(msg);
                return new ActionResult({
                    extractedContent: msg,
                    includeInMemory: true
                });
            }
            catch (e) {
                // Best effort: if formatting or the LLM call fails, return the raw
                // Markdown instead (not flagged for memory).
                logger.debug(`Error extracting content: ${e}`);
                const msg = `๐Ÿ“„ Extracted from page\n: ${content}\n`;
                logger.debug(msg);
                return new ActionResult({ extractedContent: msg });
            }
        });

        // ---- Scrolling ------------------------------------------------------
        this.registry.registerAction('scrollDown', 'Scroll down the page by pixel amount - if no amount is specified, scroll down one page', ScrollActionSchema, async function scrollDown(params, browser) {
            const scrollActionSchema = ScrollActionSchema.parse(params);
            const page = await browser.get_current_page();
            const hasAmount = scrollActionSchema.amount !== null && scrollActionSchema.amount !== undefined;
            if (hasAmount) {
                // Pass the value as an argument instead of splicing it into
                // evaluated source text.
                await page.evaluate((dy) => { window.scrollBy(0, dy); }, scrollActionSchema.amount);
            }
            else {
                await page.evaluate(() => { window.scrollBy(0, window.innerHeight); });
            }
            const amount = hasAmount ? `${scrollActionSchema.amount} pixels` : 'one page';
            const msg = `๐Ÿ” Scrolled down the page by ${amount}`;
            logger.debug(msg);
            return new ActionResult({
                extractedContent: msg,
                includeInMemory: true
            });
        });

        this.registry.registerAction('scrollUp', 'Scroll up the page by pixel amount - if no amount is specified, scroll up one page', ScrollActionSchema, async function scrollUp(params, browser) {
            const scrollActionSchema = ScrollActionSchema.parse(params);
            const page = await browser.get_current_page();
            const hasAmount = scrollActionSchema.amount !== null && scrollActionSchema.amount !== undefined;
            if (hasAmount) {
                // Negate numerically; building "-${amount}" as text produced the
                // invalid expression "--N" for negative amounts.
                await page.evaluate((dy) => { window.scrollBy(0, -dy); }, scrollActionSchema.amount);
            }
            else {
                await page.evaluate(() => { window.scrollBy(0, -window.innerHeight); });
            }
            const amount = hasAmount ? `${scrollActionSchema.amount} pixels` : 'one page';
            const msg = `๐Ÿ” Scrolled up the page by ${amount}`;
            logger.debug(msg);
            return new ActionResult({
                extractedContent: msg,
                includeInMemory: true
            });
        });

        // ---- Keyboard -------------------------------------------------------
        this.registry.registerAction('sendKeys', 'Send strings of special keys like Escape,Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press.', SendKeysActionSchema, async function sendKeys(params, browser) {
            const sendKeysActionSchema = SendKeysActionSchema.parse(params);
            const page = await browser.get_current_page();
            try {
                await page.keyboard.press(sendKeysActionSchema.keys);
            }
            catch (e) {
                if (String(e).includes('Unknown key')) {
                    // The whole string is not a known key name: press it one
                    // character at a time instead.
                    for (const key of sendKeysActionSchema.keys) {
                        try {
                            await page.keyboard.press(key);
                        }
                        catch (keyError) {
                            logger.debug(`Error sending key ${key}: ${String(keyError)}`);
                            throw keyError;
                        }
                    }
                }
                else {
                    throw e;
                }
            }
            const msg = `โŒจ๏ธ Sent keys: ${sendKeysActionSchema.keys}`;
            logger.debug(msg);
            return new ActionResult({
                extractedContent: msg,
                includeInMemory: true
            });
        });

        // ---- Click by selector / xpath / text -------------------------------
        // Registered with the same schema the handler parses with, so the
        // registry sees `css_selector` rather than the index-based click schema.
        this.registry.registerAction('clickElementBySelector', 'Click element by selector', ClickElementBySelectorActionSchema, async function clickElementBySelector(params, browser) {
            const parsedParams = ClickElementBySelectorActionSchema.parse(params);
            const elementNode = await browser.get_locate_element_by_css_selector(parsedParams.css_selector);
            if (elementNode) {
                await elementNode.click();
                const msg = `๐Ÿ–ฑ๏ธ Clicked on element with selector ${parsedParams.css_selector}`;
                logger.debug(msg);
                return new ActionResult({
                    extractedContent: msg,
                    includeInMemory: true
                });
            }
            else {
                const msg = `Element not clickable with selector ${parsedParams.css_selector} - most likely the page changed`;
                logger.debug(msg);
                return new ActionResult({ error: msg });
            }
        });

        // Registered with the xpath schema for the same reason as above.
        this.registry.registerAction('clickElementByXpath', 'Click on element by xpath', ClickElementByXpathActionSchema, async function clickElementByXpath(params, browser) {
            const parsedParams = ClickElementByXpathActionSchema.parse(params);
            const elementNode = await browser.get_locate_element_by_xpath(parsedParams.xpath);
            if (elementNode) {
                await elementNode.click();
                const msg = `๐Ÿ–ฑ๏ธ Clicked on element with xpath ${parsedParams.xpath}`;
                logger.debug(msg);
                return new ActionResult({
                    extractedContent: msg,
                    includeInMemory: true
                });
            }
            else {
                const msg = `Element not clickable with xpath ${parsedParams.xpath} - most likely the page changed`;
                logger.debug(msg);
                return new ActionResult({ error: msg });
            }
        });

        this.registry.registerAction('clickElementByText', 'Click element with text', ClickElementByTextActionSchema, async function clickElementByText(params, browser) {
            const parsedParams = ClickElementByTextActionSchema.parse(params);
            const elementNode = await browser.get_locate_element_by_text(parsedParams.text, parsedParams.nth);
            if (elementNode) {
                await elementNode.click();
                const msg = `๐Ÿ–ฑ๏ธ Clicked on element with text ${parsedParams.text}`;
                logger.debug(msg);
                return new ActionResult({
                    extractedContent: msg,
                    includeInMemory: true
                });
            }
            else {
                const msg = `No element found for text '${parsedParams.text}'`;
                logger.debug(msg);
                return new ActionResult({ error: msg });
            }
        });

        this.registry.registerAction('closeTab', 'Close tab', NoParamsActionSchema, async function closeTab(_, browser) {
            await browser.close_current_tab();
            const msg = 'Closed tab';
            logger.debug(msg);
            return new ActionResult({
                extractedContent: msg,
                includeInMemory: true
            });
        });

        // ---- Scroll to text -------------------------------------------------
        // Lookup order: text locator, XPath contains(), stepwise scrolling with
        // both lookups repeated, and finally an in-page DOM scan.
        this.registry.registerAction('scrollToText', 'If you dont find something which you want to interact with, scroll to it', ScrolltoTextActionSchema, async function scrollToText(params, browser) {
            const scrollActionSchema = ScrolltoTextActionSchema.parse(params);
            const text = scrollActionSchema.text;
            const page = await browser.get_current_page();
            try {
                logger.debug(`๐Ÿ” Attempting to find and scroll to text: "${text}"`);
                const elementByText = await browser.get_locate_element_by_text(text);
                if (elementByText) {
                    await elementByText.scrollIntoViewIfNeeded();
                    await page.waitForTimeout(500); // Wait for scroll to complete
                    const msg = `Scrolled to text: "${text}"`;
                    logger.debug(msg);
                    return new ActionResult({
                        extractedContent: msg,
                        includeInMemory: true
                    });
                }
                // Quote the text as a proper XPath literal so apostrophes and
                // double quotes in it cannot break the expression.
                const xpathText = `//*[contains(text(), ${toXPathLiteral(text)})]`;
                const elementByXPath = await browser.get_locate_element_by_xpath(xpathText);
                if (elementByXPath) {
                    await elementByXPath.scrollIntoViewIfNeeded();
                    await page.waitForTimeout(500);
                    const msg = `Scrolled to text: "${text}" (via XPath)`;
                    logger.debug(msg);
                    return new ActionResult({
                        extractedContent: msg,
                        includeInMemory: true
                    });
                }
                logger.debug(`Element with text "${text}" not found directly. Trying progressive scroll search...`);
                const dimensions = await page.evaluate(() => ({
                    windowHeight: window.innerHeight,
                    documentHeight: document.body.scrollHeight
                }));
                const windowHeight = dimensions.windowHeight;
                const documentHeight = dimensions.documentHeight;
                // Step by 80% of the viewport. Clamp to at least 1px so a
                // zero-height viewport cannot make the division yield Infinity
                // and the loop run forever.
                const scrollStep = Math.max(1, Math.floor(windowHeight * 0.8));
                const maxScrolls = Math.ceil(documentHeight / scrollStep) + 1;
                for (let i = 0; i < maxScrolls; i++) {
                    await page.evaluate((dy) => { window.scrollBy(0, dy); }, scrollStep);
                    await page.waitForTimeout(300); // Wait for content to load
                    const elementAfterScroll = await browser.get_locate_element_by_text(text);
                    if (elementAfterScroll) {
                        await elementAfterScroll.scrollIntoViewIfNeeded();
                        await page.waitForTimeout(500);
                        const msg = `Scrolled to text: "${text}" (after progressive scroll)`;
                        logger.debug(msg);
                        return new ActionResult({
                            extractedContent: msg,
                            includeInMemory: true
                        });
                    }
                    const xpathElementAfterScroll = await browser.get_locate_element_by_xpath(xpathText);
                    if (xpathElementAfterScroll) {
                        await xpathElementAfterScroll.scrollIntoViewIfNeeded();
                        await page.waitForTimeout(500);
                        const msg = `Scrolled to text: "${text}" (via XPath after progressive scroll)`;
                        logger.debug(msg);
                        return new ActionResult({
                            extractedContent: msg,
                            includeInMemory: true
                        });
                    }
                }
                // Last resort: scan the DOM inside the page. The text travels as
                // an argument, so no escaping is needed. Only the deepest
                // element containing it is accepted - otherwise <html>, whose
                // textContent includes everything, would always match first.
                const foundByEvaluation = await page.evaluate((needle) => {
                    const skippedTags = ['SCRIPT', 'STYLE', 'NOSCRIPT'];
                    for (const el of document.querySelectorAll('*')) {
                        if (skippedTags.includes(el.tagName)) {
                            continue;
                        }
                        if (!el.textContent || !el.textContent.includes(needle)) {
                            continue;
                        }
                        const childAlsoMatches = Array.from(el.children).some((child) => !skippedTags.includes(child.tagName)
                            && Boolean(child.textContent)
                            && child.textContent.includes(needle));
                        if (childAlsoMatches) {
                            continue;
                        }
                        el.scrollIntoView({ behavior: 'smooth', block: 'center' });
                        return true;
                    }
                    return false;
                }, text);
                if (foundByEvaluation) {
                    await page.waitForTimeout(500);
                    const msg = `Scrolled to text: "${text}" (via DOM traversal)`;
                    logger.debug(msg);
                    return new ActionResult({
                        extractedContent: msg,
                        includeInMemory: true
                    });
                }
                const notFoundMsg = `Text "${text}" not found on page after extensive search`;
                logger.debug(notFoundMsg);
                return new ActionResult({
                    extractedContent: notFoundMsg,
                    includeInMemory: true,
                    success: false
                });
            }
            catch (error) {
                const errorMsg = `Error scrolling to text "${text}": ${String(error)}`;
                logger.debug(errorMsg);
                return new ActionResult({
                    error: errorMsg,
                    includeInMemory: true
                });
            }
        });
    }

    /**
     * Execute an action through the registry.
     *
     * The action name is the first key of `action.actions`; the parameters are
     * taken from `action.params` (an empty object when that is falsy). Any
     * exception is converted into an `ActionResult` carrying an `error`, so
     * this method does not throw.
     *
     * @param action The action model to execute
     * @param browserContext The browser context
     * @param pageExtractionLlm Optional LLM for page extraction
     * @param sensitiveData Optional sensitive data
     * @param availableFilePaths Optional available file paths
     * @param context Optional user context
     * @returns The registry's result, or an error `ActionResult` on failure
     */
    async act(action, browserContext, pageExtractionLlm, sensitiveData, availableFilePaths, context) {
        try {
            if (!action.actions || Object.keys(action.actions).length === 0) {
                return new ActionResult({
                    error: "No actions defined in ActionModel",
                    success: false
                });
            }
            // Only the first action name is used.
            const actionName = Object.keys(action.actions)[0];
            const params = action.params || {};
            logger.success(`๐ŸŽฎExecuting action: ${actionName} โš™๏ธ${JSON.stringify(params)}`);
            // Bundle everything the registry hands on to the action handler.
            const contextObj = {
                browser: browserContext,
                pageExtractionLlm,
                sensitiveData,
                availableFilePaths,
                userContext: context
            };
            const result = await this.registry.executeAction(actionName, params, contextObj);
            if (result.error && result.error.length > 0) {
                logger.debuggerError(`๐ŸŽฎProblem Executing ${actionName}: ${result.error}`);
            }
            return result;
        }
        catch (e) {
            // Thrown values are not guaranteed to be Error instances.
            const reason = e?.message ?? String(e);
            logger.debug(`Error in Controller.act: ${reason}`);
            return new ActionResult({
                error: `Error executing action: ${reason}`,
                success: false
            });
        }
    }
}
//# sourceMappingURL=controllerContext.js.map
name (there should only be one) const actionName = Object.keys(action.actions)[0]; // Get the parameters either from paramToJson() or directly from params const params = action.params || {}; logger.success(`๐ŸŽฎExecuting action: ${actionName} โš™๏ธ${JSON.stringify(params)}`); // Create a context object with all required parameters const contextObj = { browser: browserContext, pageExtractionLlm, sensitiveData, availableFilePaths, userContext: context }; // Pass the parameters as needed by the registry's executeAction method const result = await this.registry.executeAction(actionName, params, contextObj); if (result.error) { if (result.error.length > 0) { logger.debuggerError(`๐ŸŽฎProblem Executing ${actionName}: ${result.error}`); } } return result; } catch (e) { logger.debug(`Error in Controller.act: ${e.message}`); return new ActionResult({ error: `Error executing action: ${e.message}`, success: false }); } } } //# sourceMappingURL=controllerContext.js.map