UNPKG

@ui-tars/operator-browser

Version:
622 lines (621 loc) 27.2 kB
/**
 * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
 * SPDX-License-Identifier: Apache-2.0
 */
"use strict";

// ---------------------------------------------------------------------------
// Bundler-generated module scaffolding (kept functionally as emitted).
// ---------------------------------------------------------------------------
var __webpack_require__ = {};
(() => {
  // Define enumerable getter exports, skipping keys the target already owns.
  __webpack_require__.d = (exports1, definition) => {
    for (var key in definition) {
      if (__webpack_require__.o(definition, key) && !__webpack_require__.o(exports1, key)) {
        Object.defineProperty(exports1, key, {
          enumerable: true,
          get: definition[key]
        });
      }
    }
  };
})();
(() => {
  // Own-property check that does not rely on the object's own hasOwnProperty.
  __webpack_require__.o = (obj, prop) => Object.prototype.hasOwnProperty.call(obj, prop);
})();
(() => {
  // Mark an exports object as an ES module namespace.
  __webpack_require__.r = (exports1) => {
    if ('undefined' != typeof Symbol && Symbol.toStringTag) {
      Object.defineProperty(exports1, Symbol.toStringTag, {
        value: 'Module'
      });
    }
    Object.defineProperty(exports1, '__esModule', {
      value: true
    });
  };
})();
var __webpack_exports__ = {};
__webpack_require__.r(__webpack_exports__);
__webpack_require__.d(__webpack_exports__, {
  BrowserOperator: () => BrowserOperator,
  RemoteBrowserOperator: () => RemoteBrowserOperator,
  DefaultBrowserOperator: () => DefaultBrowserOperator
});
const browser_namespaceObject = require("@agent-infra/browser");
const logger_namespaceObject = require("@agent-infra/logger");
const core_namespaceObject = require("@ui-tars/sdk/core");
const external_types_js_namespaceObject = require("./types.js");
const external_ui_helper_js_namespaceObject = require("./ui-helper.js");
const external_key_map_js_namespaceObject = require("./key-map.js");
const external_shortcuts_js_namespaceObject = require("./shortcuts.js");

/**
 * Class-field helper emitted by the compiler: defines `key` on `obj` as a
 * plain enumerable/configurable/writable data property.
 *
 * @param {object} obj - Target object (an instance or a class constructor).
 * @param {string} key - Property name.
 * @param {*} value - Initial value.
 * @returns {object} The same `obj`.
 */
function _define_property(obj, key, value) {
  if (key in obj) {
    Object.defineProperty(obj, key, {
      value: value,
      enumerable: true,
      configurable: true,
      writable: true
    });
  } else {
    obj[key] = value;
  }
  return obj;
}

/**
 * Splits a key-combination string such as `"ctrl+c"` or `"ctrl + c"` into
 * the key names found in `KEY_MAPPINGS` (from `./key-map.js`).
 *
 * Separators are whitespace and `+`. Runs of separators are collapsed and
 * empty tokens dropped, so `"ctrl + c"` no longer produces empty tokens that
 * were rejected as `Unsupported key: `.
 *
 * @param {string} keyStr - Raw key combination.
 * @returns {Array} Mapped key values, in the order they appeared.
 * @throws {Error} `Unsupported key: …` when a token has no mapping, or when
 *   the string contains no key token at all.
 */
function normalizeKeyCombo(keyStr) {
  const tokens = keyStr.split(/[\s+]+/).filter((token) => token.length > 0);
  if (tokens.length === 0) throw new Error(`Unsupported key: ${keyStr}`);
  return tokens.map((token) => {
    const keyInput = external_key_map_js_namespaceObject.KEY_MAPPINGS[token.toLowerCase()];
    if (keyInput) return keyInput;
    throw new Error(`Unsupported key: ${token}`);
  });
}

// Keys after which the operator waits for a possible page navigation.
const NAVIGATION_KEYS = ['Enter', 'F5'];

/**
 * Operator that carries out parsed model actions (click, type, scroll,
 * drag, navigate, …) on the active page of a browser supplied via options.
 */
class BrowserOperator extends core_namespaceObject.Operator {
  /**
   * Returns the browser's active page and remembers it as `currentPage`.
   *
   * @returns {Promise<object>} The active page.
   * @throws {Error} When the browser reports no active page.
   */
  async getActivePage() {
    const page = await this.browser.getActivePage();
    if (!page) throw new Error('No active page found');
    if (this.currentPage !== page) this.currentPage = page;
    return page;
  }

  /**
   * Turns highlighting of clickable elements before screenshots on or off.
   *
   * @param {boolean} enable
   */
  setHighlightClickableElements(enable) {
    this.highlightClickableElements = enable;
    this.logger.info(`Clickable elements highlighting ${enable ? 'enabled' : 'disabled'}`);
  }

  /**
   * Turns the water-flow visual effect shown at screenshot time on or off.
   *
   * @param {boolean} enable
   */
  setShowWaterFlow(enable) {
    this.showWaterFlowEffect = enable;
    this.logger.info(`Water flow effect ${enable ? 'enabled' : 'disabled'}`);
  }

  /**
   * Captures the current viewport as a base64 JPEG.
   *
   * Optionally highlights clickable elements first, removes temporary
   * visuals, takes the screenshot and then invokes `options.onScreenshot`
   * (errors from that callback are logged, not propagated).
   *
   * @returns {Promise<{base64: string, scaleFactor: number}>}
   * @throws Re-throws any error raised while preparing or taking the
   *   screenshot, after attempting to remove clickable highlights.
   */
  async screenshot() {
    this.logger.info('Starting screenshot...');
    if (this.showWaterFlowEffect) this.uiHelper.showWaterFlow();
    const page = await this.getActivePage();
    try {
      const deviceScaleFactor = await this.getDeviceScaleFactor();
      this.logger.info('DeviceScaleFactor:', deviceScaleFactor);
      if (this.highlightClickableElements) {
        this.logger.info('Highlighting clickable elements...');
        await this.uiHelper.highlightClickableElements();
        // Short pause before capturing, after the highlights were requested.
        await this.delay(300);
      }
      this.logger.info('Taking screenshot...');
      const startTime = Date.now();
      await this.uiHelper.cleanupTemporaryVisuals();
      const buffer = await page.screenshot({
        captureBeyondViewport: false,
        encoding: 'base64',
        type: 'jpeg',
        quality: 75,
        fullPage: false
      });
      const duration = Date.now() - startTime;
      this.logger.info(`Screenshot taken in ${duration}ms`);
      const output = {
        base64: buffer.toString(),
        scaleFactor: deviceScaleFactor || 1
      };
      // Never log the image payload itself.
      this.logger.info('Screenshot Info', {
        ...output,
        base64: '<base64>'
      });
      try {
        await this.options.onScreenshot?.(output, page);
      } catch (error) {
        this.logger.error('Error in onScreenshot callback:', error);
      }
      return output;
    } catch (error) {
      if (this.highlightClickableElements) await this.uiHelper.removeClickableHighlights();
      this.logger.error('Screenshot failed:', error);
      throw error;
    }
  }

  /**
   * Executes one parsed prediction against the active page.
   *
   * `start_box` is converted to screen coordinates with
   * `parseBoxToScreenCoords` and divided by the device scale factor.
   * NOTE(review): a coordinate of exactly 0 is treated as missing by the
   * truthiness checks below; confirm whether (0, y)/(x, 0) must be clickable.
   *
   * @param {{parsedPrediction: object, screenWidth: number, screenHeight: number}} params
   * @returns {Promise<{startX: (number|null), startY: (number|null), action_inputs: object}>}
   * @throws Re-throws any action failure after logging it and calling `cleanup()`.
   */
  async execute(params) {
    this.logger.info('Starting execute with params:', params);
    const { parsedPrediction, screenWidth, screenHeight } = params;
    if (this.showActionInfo) {
      await this.uiHelper?.showActionInfo(parsedPrediction);
    }
    await this.options.onOperatorAction?.(parsedPrediction);
    const { action_type, action_inputs } = parsedPrediction;
    const startBoxStr = action_inputs?.start_box || '';
    const deviceScaleFactor = await this.getDeviceScaleFactor();
    const coords = (0, core_namespaceObject.parseBoxToScreenCoords)({
      boxStr: startBoxStr,
      screenWidth,
      screenHeight
    });
    const startX = coords.x ? coords.x / deviceScaleFactor : null;
    const startY = coords.y ? coords.y / deviceScaleFactor : null;
    this.logger.info(`Parsed coordinates: (${startX}, ${startY})`);
    this.logger.info(`Executing action: ${action_type}`);
    try {
      await this.getActivePage();
      switch (action_type) {
        case 'drag':
          await this.handleDrag(action_inputs, deviceScaleFactor, screenWidth, screenHeight);
          break;
        case 'navigate':
          if (!action_inputs.content) throw new Error('No target url specified for navigation');
          await this.handleNavigate({
            url: action_inputs.content
          });
          break;
        case 'navigate_back':
          await this.handleNavigateBack();
          break;
        case 'click':
        case 'left_click':
        case 'left_single':
          // Fixed: the message used to print startX in place of startY.
          if (startX && startY) await this.handleClick(startX, startY);
          else throw new Error(`Missing startX(${startX}) or startY(${startY}).`);
          break;
        case 'double_click':
        case 'left_double':
          if (startX && startY) await this.handleDoubleClick(startX, startY);
          else throw new Error(`Missing startX(${startX}) or startY(${startY}).`);
          break;
        case 'right_click':
          if (startX && startY) await this.handleRightClick(startX, startY);
          else throw new Error(`Missing startX(${startX}) or startY(${startY}).`);
          break;
        case 'type':
          await this.handleType(action_inputs);
          await this.delay(1000);
          break;
        case 'hotkey':
          await this.handleHotkey(action_inputs);
          break;
        case 'press':
          await this.handlePress(action_inputs);
          break;
        case 'release':
          await this.handleRelease(action_inputs);
          break;
        case 'scroll':
          await this.handleScroll(action_inputs);
          break;
        case 'wait':
          await this.delay(5000);
          break;
        case 'finished':
          if (this.options.onFinalAnswer && parsedPrediction.thought) {
            await this.options.onFinalAnswer(parsedPrediction.thought);
          }
          this.uiHelper.cleanup();
          break;
        case 'call_user':
          this.uiHelper.cleanup();
          break;
        case 'user_stop':
          this.uiHelper.cleanup();
          break;
        default:
          this.logger.warn(`[BrowserOperator] Unsupported action: ${action_type}`);
      }
      this.logger.info(`Action ${action_type} completed successfully`);
    } catch (error) {
      this.logger.error(`Failed to execute ${action_type}:`, error);
      await this.cleanup();
      throw error;
    }
    return {
      startX,
      startY,
      action_inputs
    };
  }

  /**
   * Shows a click indicator, moves the mouse to (x, y) and left-clicks.
   *
   * @param {number} x
   * @param {number} y
   * @throws Re-throws any failure after logging it.
   */
  async handleClick(x, y) {
    this.logger.info(`Clicking at (${x}, ${y})`);
    const page = await this.getActivePage();
    try {
      await this.uiHelper?.showClickIndicator(x, y);
      await this.delay(300);
      await page.mouse.move(x, y);
      await this.delay(100);
      await page.mouse.click(x, y);
      await this.delay(800);
      this.logger.info('Click completed');
    } catch (error) {
      this.logger.error('Click operation failed:', error);
      throw error;
    }
  }

  /**
   * Same as `handleClick`, but clicks with `clickCount: 2`.
   *
   * @param {number} x
   * @param {number} y
   * @throws Re-throws any failure after logging it.
   */
  async handleDoubleClick(x, y) {
    this.logger.info(`Double clicking at (${x}, ${y})`);
    const page = await this.getActivePage();
    try {
      await this.uiHelper?.showClickIndicator(x, y);
      await this.delay(300);
      await page.mouse.move(x, y);
      await this.delay(100);
      await page.mouse.click(x, y, {
        clickCount: 2
      });
      await this.delay(800);
      this.logger.info('Double click completed');
    } catch (error) {
      this.logger.error('Double click operation failed:', error);
      throw error;
    }
  }

  /**
   * Same as `handleClick`, but clicks with `button: 'right'`.
   *
   * @param {number} x
   * @param {number} y
   * @throws Re-throws any failure after logging it.
   */
  async handleRightClick(x, y) {
    const page = await this.getActivePage();
    this.logger.info(`Right clicking at (${x}, ${y})`);
    try {
      await this.uiHelper?.showClickIndicator(x, y);
      await this.delay(300);
      await page.mouse.move(x, y);
      await this.delay(100);
      await page.mouse.click(x, y, {
        button: 'right'
      });
      await this.delay(800);
      this.logger.info('Right click completed');
    } catch (error) {
      this.logger.error('Right click operation failed:', error);
      throw error;
    }
  }

  /**
   * Types `inputs.content` into the page. When the content ends with a
   * newline (a real one or the two characters backslash + `n`), the marker
   * is not typed; Enter is pressed instead and a possible navigation is
   * awaited.
   *
   * @param {{content?: string}} inputs
   */
  async handleType(inputs) {
    const page = await this.getActivePage();
    const rawContent = inputs.content;
    const content = rawContent?.trim();
    if (!content) return void this.logger.warn('No content to type');
    this.logger.info('Typing content:', content);
    const stripContent = content.replace(/\\n$/, '').replace(/\n$/, '');
    await page.keyboard.type(stripContent, {
      delay: 20 + 30 * Math.random()
    });
    // Decide on the UNTRIMMED text: trim() already removed a real trailing
    // newline, so checking the trimmed string could only ever see the
    // literal backslash-n form.
    const shouldPressEnter = /(?:\\n|\n)\s*$/.test(rawContent);
    if (!shouldPressEnter) {
      this.logger.info('Typing completed');
      return;
    }
    await this.delay(50);
    this.logger.info('Pressing Enter after content');
    await page.keyboard.press('Enter');
    this.logger.info('Typing completed');
    await this.waitForPossibleNavigation(page);
  }

  /**
   * Runs a keyboard shortcut given as `inputs.key` or `inputs.hotkey`.
   *
   * @param {{key?: string, hotkey?: string}} inputs
   * @throws {Error} When no hotkey is given or a key name is unsupported.
   */
  async handleHotkey(inputs) {
    const page = await this.getActivePage();
    const keyStr = inputs?.key || inputs?.hotkey;
    if (!keyStr) {
      this.logger.warn('No hotkey specified');
      throw new Error("No hotkey specified");
    }
    this.logger.info(`Executing hotkey: ${keyStr}`);
    const normalizedKeys = normalizeKeyCombo(keyStr);
    this.logger.info("Normalized keys:", normalizedKeys);
    await (0, external_shortcuts_js_namespaceObject.shortcuts)(page, normalizedKeys, this.options.browserType);
    if (normalizedKeys.some((key) => NAVIGATION_KEYS.includes(key))) {
      this.logger.info('Waiting for possible navigation after hotkey');
      await this.waitForPossibleNavigation(page);
    } else {
      await this.delay(500);
    }
    this.logger.info('Hotkey execution completed');
  }

  /**
   * Holds down every key named in `inputs.key` (in order, 50 ms apart)
   * without releasing them.
   *
   * @param {{key?: string}} inputs
   * @throws {Error} When no key is given or a key name is unsupported.
   */
  async handlePress(inputs) {
    const page = await this.getActivePage();
    const keyStr = inputs?.key;
    if (!keyStr) {
      this.logger.warn('No key specified for press');
      throw new Error("No key specified for press");
    }
    this.logger.info(`Pressing key: ${keyStr}`);
    const normalizedKeys = normalizeKeyCombo(keyStr);
    this.logger.info("Normalized keys for press:", normalizedKeys);
    for (const key of normalizedKeys) {
      await page.keyboard.down(key);
      await this.delay(50);
    }
    this.logger.info('Press operation completed');
  }

  /**
   * Releases every key named in `inputs.key` (in order, 50 ms apart), then
   * waits for a possible navigation if Enter or F5 was among them.
   *
   * @param {{key?: string}} inputs
   * @throws {Error} When no key is given or a key name is unsupported.
   */
  async handleRelease(inputs) {
    const page = await this.getActivePage();
    const keyStr = inputs?.key;
    if (!keyStr) {
      this.logger.warn('No key specified for release');
      throw new Error("No key specified for release");
    }
    this.logger.info(`Releasing key: ${keyStr}`);
    const normalizedKeys = normalizeKeyCombo(keyStr);
    this.logger.info("Normalized keys for release:", normalizedKeys);
    for (const key of normalizedKeys) {
      await page.keyboard.up(key);
      await this.delay(50);
    }
    if (normalizedKeys.some((key) => NAVIGATION_KEYS.includes(key))) {
      this.logger.info('Waiting for possible navigation after key release');
      await this.waitForPossibleNavigation(page);
    } else {
      await this.delay(500);
    }
    this.logger.info('Release operation completed');
  }

  /**
   * Scrolls the page 500px up or down with the mouse wheel. Any other
   * direction is logged as unsupported and ignored.
   *
   * @param {{direction?: string}} inputs
   */
  async handleScroll(inputs) {
    const page = await this.getActivePage();
    const { direction } = inputs;
    const scrollAmount = 500;
    this.logger.info(`Scrolling ${direction} by ${scrollAmount}px`);
    switch (direction?.toLowerCase()) {
      case 'up':
        await page.mouse.wheel({
          deltaY: -scrollAmount
        });
        break;
      case 'down':
        await page.mouse.wheel({
          deltaY: scrollAmount
        });
        break;
      default:
        this.logger.warn(`Unsupported scroll direction: ${direction}`);
        return;
    }
    this.logger.info('Scroll completed');
  }

  /**
   * Resolves after `ms` milliseconds.
   *
   * @param {number} ms
   * @returns {Promise<void>}
   */
  async delay(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Navigates the active page to `inputs.url`, prefixing `https://` when
   * the URL has no http/https scheme.
   *
   * @param {{url: string}} inputs
   */
  async handleNavigate(inputs) {
    const page = await this.getActivePage();
    let { url } = inputs;
    if (!/^https?:\/\//i.test(url)) url = 'https://' + url;
    this.logger.info(`Navigating to: ${url}`);
    await page.goto(url, {
      waitUntil: []
    });
    this.logger.info('Navigation completed');
  }

  /**
   * Drags from the centre of `inputs.start_box` to the centre of
   * `inputs.end_box` in 10 interpolated mouse moves.
   *
   * NOTE(review): as in `execute`, a coordinate of exactly 0 is rejected as
   * invalid by the truthiness checks.
   *
   * @param {{start_box?: string, end_box?: string}} inputs
   * @param {number} deviceScaleFactor - Divisor applied to screen coordinates.
   * @param {number} screenWidth
   * @param {number} screenHeight
   * @throws {Error} When a box is missing or resolves to invalid coordinates;
   *   re-throws any mouse failure after logging it.
   */
  async handleDrag(inputs, deviceScaleFactor, screenWidth, screenHeight) {
    const page = await this.getActivePage();
    const startBoxStr = inputs.start_box || '';
    const endBoxStr = inputs.end_box || '';
    if (!startBoxStr || !endBoxStr) throw new Error('Missing start_point or end_point for drag operation');
    const startCoords = (0, core_namespaceObject.parseBoxToScreenCoords)({
      boxStr: startBoxStr,
      screenWidth,
      screenHeight
    });
    const endCoords = (0, core_namespaceObject.parseBoxToScreenCoords)({
      boxStr: endBoxStr,
      screenWidth,
      screenHeight
    });
    const startX = startCoords.x ? startCoords.x / deviceScaleFactor : null;
    const startY = startCoords.y ? startCoords.y / deviceScaleFactor : null;
    const endX = endCoords.x ? endCoords.x / deviceScaleFactor : null;
    const endY = endCoords.y ? endCoords.y / deviceScaleFactor : null;
    if (!startX || !startY || !endX || !endY) throw new Error('Invalid coordinates for drag operation');
    this.logger.info(`Dragging from (${startX}, ${startY}) to (${endX}, ${endY})`);
    try {
      await this.uiHelper?.showDragIndicator(startX, startY, endX, endY);
      await this.delay(300);
      await page.mouse.move(startX, startY);
      await this.delay(100);
      await page.mouse.down();
      // Move in small increments rather than one jump to the end point.
      const steps = 10;
      for (let i = 1; i <= steps; i++) {
        const stepX = startX + (endX - startX) * i / steps;
        const stepY = startY + (endY - startY) * i / steps;
        await page.mouse.move(stepX, stepY);
        await this.delay(30);
      }
      await this.delay(100);
      await page.mouse.up();
      await this.delay(800);
      this.logger.info('Drag completed');
    } catch (error) {
      this.logger.error('Drag operation failed:', error);
      throw error;
    }
  }

  /** Navigates the active page one step back in its history. */
  async handleNavigateBack() {
    const page = await this.getActivePage();
    this.logger.info("handleNavigateBack");
    await page.goBack();
    this.logger.info('handleNavigateBack completed');
  }

  /**
   * Waits until the page emits `framenavigated` or 5 seconds elapse,
   * whichever comes first. Never rejects.
   *
   * @param {object} page - Page to listen on.
   * @returns {Promise<void>}
   */
  async waitForPossibleNavigation(page) {
    await new Promise((resolve) => {
      let timeoutId;
      const onStarted = () => {
        // Cancel the fallback timer so it does not linger for the
        // remainder of the 5 s window once navigation has been observed.
        clearTimeout(timeoutId);
        this.logger.info('Navigation started');
        resolve();
        page.off('framenavigated', onStarted);
      };
      page.on('framenavigated', onStarted);
      timeoutId = setTimeout(() => {
        page.off('framenavigated', onStarted);
        resolve();
      }, 5000);
    });
    this.logger.info('Navigation completed or timed out');
  }

  /**
   * Returns the device scale factor, caching the first truthy value found.
   * Reads the page viewport first and falls back to `window.devicePixelRatio`
   * evaluated in the page.
   *
   * @returns {Promise<number>}
   * @throws {Error} When neither source yields a truthy value.
   */
  async getDeviceScaleFactor() {
    if (this.deviceScaleFactor) return this.deviceScaleFactor;
    this.logger.info('Getting deviceScaleFactor info...');
    const page = await this.getActivePage();
    const scaleFactor = page.viewport()?.deviceScaleFactor;
    if (scaleFactor) {
      this.deviceScaleFactor = scaleFactor;
      return scaleFactor;
    }
    const devicePixelRatio = await page.evaluate(() => window.devicePixelRatio);
    if (devicePixelRatio) {
      this.deviceScaleFactor = devicePixelRatio;
      return devicePixelRatio;
    }
    throw new Error('Get deviceScaleFactor failed.');
  }

  /**
   * Cleans up the UI helper and closes the remembered current page, if any.
   */
  async cleanup() {
    this.logger.info('Starting cleanup...');
    await this.uiHelper.cleanup();
    if (this.currentPage) {
      await this.currentPage.close();
      this.currentPage = null;
      this.logger.info('Page closed successfully');
    }
    this.logger.info('Cleanup completed');
  }

  /**
   * @param {object} options - Reads `browser`, `logger`, `browserType`,
   *   `highlightClickableElements`, `showActionInfo`, `showWaterFlow`,
   *   `onScreenshot`, `onOperatorAction` and `onFinalAnswer`. The three
   *   visual flags default to enabled and are disabled only by an explicit
   *   `false`.
   */
  constructor(options) {
    super();
    _define_property(this, "options", void 0);
    _define_property(this, "browser", void 0);
    _define_property(this, "currentPage", void 0);
    _define_property(this, "logger", void 0);
    _define_property(this, "uiHelper", void 0);
    _define_property(this, "highlightClickableElements", void 0);
    _define_property(this, "showActionInfo", void 0);
    _define_property(this, "showWaterFlowEffect", void 0);
    _define_property(this, "deviceScaleFactor", void 0);
    this.options = options;
    this.currentPage = null;
    this.highlightClickableElements = true;
    this.showActionInfo = true;
    this.showWaterFlowEffect = true;
    this.browser = this.options.browser;
    this.logger = (this.options.logger ?? logger_namespaceObject.defaultLogger).spawn('[BrowserOperator]');
    // The helper resolves the page lazily so it always acts on the active one.
    this.uiHelper = new external_ui_helper_js_namespaceObject.UIHelper(() => this.getActivePage(), this.logger);
    if (false === options.highlightClickableElements) this.highlightClickableElements = false;
    if (false === options.showActionInfo) this.showActionInfo = false;
    if (false === options.showWaterFlow) this.showWaterFlowEffect = false;
  }
}

/**
 * BrowserOperator bound to a locally launched browser, managed as a single
 * shared instance through static members.
 */
class DefaultBrowserOperator extends BrowserOperator {
  /**
   * Reports whether a usable local browser is available, caching its path
   * and type on the class when found.
   *
   * @param {*} browser - Passed through to `BrowserFinder.findBrowser`.
   * @returns {boolean} `true` when a browser path is known or was found;
   *   `false` when the lookup threw.
   */
  static hasBrowser(browser) {
    try {
      if (this.browserPath) return true;
      if (!this.logger) this.logger = new logger_namespaceObject.ConsoleLogger('[DefaultBrowserOperator]');
      const browserFinder = new browser_namespaceObject.BrowserFinder(this.logger);
      const browserData = browserFinder.findBrowser(browser);
      this.browserPath = browserData.path;
      this.browserType = browserData.type;
      return true;
    } catch (error) {
      if (this.logger) this.logger.error('No available browser found:', error);
      return false;
    }
  }

  /**
   * Returns the shared operator, launching the local browser and creating
   * the operator when needed. Unless `isCallUser` is set, a new page is
   * opened on the chosen search engine's home page.
   *
   * @param {boolean} [highlight=false]
   * @param {boolean} [showActionInfo=false]
   * @param {boolean} [showWaterFlow=false]
   * @param {boolean} [isCallUser=false] - Skip opening the search page.
   * @param {string} [searchEngine='google'] - Key into the search-engine URL
   *   table; unknown values fall back to the Google entry.
   * @returns {Promise<DefaultBrowserOperator>}
   */
  static async getInstance(highlight = false, showActionInfo = false, showWaterFlow = false, isCallUser = false, searchEngine = 'google') {
    if (!this.logger) this.logger = new logger_namespaceObject.ConsoleLogger('[DefaultBrowserOperator]');
    if (this.browser) {
      const isAlive = await this.browser.isBrowserAlive();
      if (!isAlive) {
        // Drop stale handles so both are recreated below.
        this.browser = null;
        this.instance = null;
      }
    }
    if (!this.browser) {
      this.browser = new browser_namespaceObject.LocalBrowser({
        logger: this.logger
      });
      await this.browser.launch({
        executablePath: this.browserPath,
        browserType: this.browserType
      });
    }
    if (!this.instance) {
      this.instance = new DefaultBrowserOperator({
        browser: this.browser,
        browserType: this.browserType,
        logger: this.logger,
        highlightClickableElements: highlight,
        showActionInfo: showActionInfo,
        showWaterFlow: showWaterFlow
      });
    }
    if (!isCallUser) {
      const openingPage = await this.browser?.createPage();
      const searchEngineUrls = {
        [external_types_js_namespaceObject.SearchEngine.GOOGLE]: 'https://www.google.com/',
        [external_types_js_namespaceObject.SearchEngine.BING]: 'https://www.bing.com/',
        [external_types_js_namespaceObject.SearchEngine.BAIDU]: 'https://www.baidu.com/'
      };
      // An unrecognised engine used to yield `goto(undefined)`; fall back to
      // the Google entry instead.
      const targetUrl = searchEngineUrls[searchEngine] ?? searchEngineUrls[external_types_js_namespaceObject.SearchEngine.GOOGLE];
      await openingPage?.goto(targetUrl, {
        waitUntil: 'networkidle2'
      });
    }
    this.instance.setHighlightClickableElements(highlight);
    return this.instance;
  }

  /**
   * Cleans up the shared operator and closes its browser. Does nothing when
   * no instance exists.
   */
  static async destroyInstance() {
    if (this.instance) {
      await this.instance.cleanup();
      if (this.browser) {
        await this.browser.close();
        this.browser = null;
      }
      this.instance = null;
    }
  }

  constructor(options) {
    super(options);
  }
}
_define_property(DefaultBrowserOperator, "instance", null);
_define_property(DefaultBrowserOperator, "browser", null);
_define_property(DefaultBrowserOperator, "browserPath", void 0);
_define_property(DefaultBrowserOperator, "browserType", void 0);
_define_property(DefaultBrowserOperator, "logger", null);

/**
 * BrowserOperator bound to a remote browser reached through a WebSocket
 * endpoint.
 */
class RemoteBrowserOperator extends BrowserOperator {
  /**
   * Connects to the remote browser and returns a newly created operator.
   * Each call creates a new `RemoteBrowser` and a new operator, replacing
   * the ones stored on the class.
   *
   * @param {string} wsEndpoint
   * @param {boolean} [highlight=false]
   * @param {boolean} [showActionInfo=false]
   * @param {boolean} [showWaterFlow=false]
   * @param {boolean} [isCallUser=false] - Accepted but not used here.
   * @returns {Promise<RemoteBrowserOperator>}
   */
  static async getInstance(wsEndpoint, highlight = false, showActionInfo = false, showWaterFlow = false, isCallUser = false) {
    if (!this.logger) this.logger = new logger_namespaceObject.ConsoleLogger('[RemoteBrowserOperator]');
    this.browser = new browser_namespaceObject.RemoteBrowser({
      wsEndpoint: wsEndpoint,
      logger: this.logger
    });
    await this.browser.launch();
    this.instance = new RemoteBrowserOperator({
      browser: this.browser,
      browserType: this.browserType,
      logger: this.logger,
      highlightClickableElements: highlight,
      showActionInfo: showActionInfo,
      showWaterFlow: showWaterFlow
    });
    this.instance.setHighlightClickableElements(highlight);
    return this.instance;
  }

  /**
   * Cleans up the stored operator and closes its browser. Does nothing when
   * no instance exists.
   */
  static async destroyInstance() {
    if (this.instance) {
      await this.instance.cleanup();
      if (this.browser) {
        await this.browser.close();
        this.browser = null;
      }
      this.instance = null;
    }
  }

  constructor(options) {
    super(options);
  }
}
_define_property(RemoteBrowserOperator, "instance", null);
_define_property(RemoteBrowserOperator, "browser", null);
_define_property(RemoteBrowserOperator, "browserType", void 0);
_define_property(RemoteBrowserOperator, "logger", null);

// ---------------------------------------------------------------------------
// CommonJS export wiring (bundler-generated).
// ---------------------------------------------------------------------------
exports.BrowserOperator = __webpack_exports__.BrowserOperator;
exports.DefaultBrowserOperator = __webpack_exports__.DefaultBrowserOperator;
exports.RemoteBrowserOperator = __webpack_exports__.RemoteBrowserOperator;
for (var __webpack_i__ in __webpack_exports__) {
  if (-1 === [
    "BrowserOperator",
    "DefaultBrowserOperator",
    "RemoteBrowserOperator"
  ].indexOf(__webpack_i__)) exports[__webpack_i__] = __webpack_exports__[__webpack_i__];
}
Object.defineProperty(exports, '__esModule', {
  value: true
});
//# sourceMappingURL=browser-operator.js.map