@ui-tars/operator-browser
Version:
Native-browser operator for UI-TARS
582 lines (581 loc) • 25.2 kB
JavaScript
/**
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
import { BrowserFinder, LocalBrowser, RemoteBrowser } from "@agent-infra/browser";
import { ConsoleLogger, defaultLogger } from "@agent-infra/logger";
import { Operator, parseBoxToScreenCoords } from "@ui-tars/sdk/core";
import { SearchEngine } from "./types.mjs";
import { UIHelper } from "./ui-helper.mjs";
import { KEY_MAPPINGS } from "./key-map.mjs";
import { shortcuts } from "./shortcuts.mjs";
/**
 * Transpiler helper that initialises a field on `obj`.
 *
 * When `key` is already reachable on `obj` (own or inherited) the field is
 * (re)defined as an own, enumerable, configurable, writable data property via
 * `Object.defineProperty`; otherwise a plain assignment is used.
 *
 * @param {object} obj - Target object (an instance or a constructor).
 * @param {string|symbol} key - Field name.
 * @param {*} value - Initial value.
 * @returns {object} The same `obj`, for chaining.
 */
function _define_property(obj, key, value) {
  const alreadyReachable = key in obj;
  if (!alreadyReachable) {
    obj[key] = value;
    return obj;
  }
  const descriptor = {
    configurable: true,
    enumerable: true,
    writable: true,
    value
  };
  Object.defineProperty(obj, key, descriptor);
  return obj;
}
/**
 * Operator that executes parsed UI-TARS predictions against the active page
 * of a browser object supplied through `options.browser`.
 *
 * Options read by this class: `browser`, `logger`, `browserType`,
 * `highlightClickableElements`, `showActionInfo`, `showWaterFlow`, and the
 * optional callbacks `onScreenshot`, `onOperatorAction`, `onFinalAnswer`.
 */
class BrowserOperator extends Operator {
  /**
   * @param {object} options - See the class description. The three visual
   *   flags default to enabled and are only switched off by an explicit
   *   `false`.
   */
  constructor(options) {
    super();
    const fields = [
      "options",
      "browser",
      "currentPage",
      "logger",
      "uiHelper",
      "highlightClickableElements",
      "showActionInfo",
      "showWaterFlowEffect",
      "deviceScaleFactor"
    ];
    for (const field of fields) {
      _define_property(this, field, void 0);
    }
    this.options = options;
    this.currentPage = null;
    this.browser = options.browser;
    this.logger = (options.logger ?? defaultLogger).spawn('[BrowserOperator]');
    this.uiHelper = new UIHelper(() => this.getActivePage(), this.logger);
    this.highlightClickableElements = options.highlightClickableElements !== false;
    this.showActionInfo = options.showActionInfo !== false;
    this.showWaterFlowEffect = options.showWaterFlow !== false;
  }

  /**
   * Ask the browser for its active page and remember it in `currentPage`.
   * @returns {Promise<object>} The active page.
   * @throws {Error} When the browser reports no active page.
   */
  async getActivePage() {
    const page = await this.browser.getActivePage();
    if (!page) throw new Error('No active page found');
    this.currentPage = page;
    return page;
  }

  /** Enable or disable highlighting of clickable elements before screenshots. */
  setHighlightClickableElements(enable) {
    this.highlightClickableElements = enable;
    this.logger.info(`Clickable elements highlighting ${enable ? 'enabled' : 'disabled'}`);
  }

  /** Enable or disable the water flow effect triggered by `screenshot()`. */
  setShowWaterFlow(enable) {
    this.showWaterFlowEffect = enable;
    this.logger.info(`Water flow effect ${enable ? 'enabled' : 'disabled'}`);
  }

  /**
   * Capture the visible viewport as a base64 JPEG.
   * @returns {Promise<{base64: string, scaleFactor: number}>}
   * @throws Rethrows any failure after logging it; errors thrown by the
   *   `onScreenshot` callback are logged and swallowed.
   */
  async screenshot() {
    this.logger.info('Starting screenshot...');
    // Deliberately not awaited, exactly as before: the capture does not wait for it.
    if (this.showWaterFlowEffect) this.uiHelper.showWaterFlow();
    const page = await this.getActivePage();
    try {
      const deviceScaleFactor = await this.getDeviceScaleFactor();
      this.logger.info('DeviceScaleFactor:', deviceScaleFactor);
      if (this.highlightClickableElements) {
        this.logger.info('Highlighting clickable elements...');
        await this.uiHelper.highlightClickableElements();
        await this.delay(300);
      }
      this.logger.info('Taking screenshot...');
      const startTime = Date.now();
      await this.uiHelper.cleanupTemporaryVisuals();
      const buffer = await page.screenshot({
        captureBeyondViewport: false,
        encoding: 'base64',
        type: 'jpeg',
        quality: 75,
        fullPage: false
      });
      const duration = Date.now() - startTime;
      this.logger.info(`Screenshot taken in ${duration}ms`);
      const output = {
        base64: buffer.toString(),
        scaleFactor: deviceScaleFactor || 1
      };
      // Never log the image payload itself.
      this.logger.info('Screenshot Info', {
        ...output,
        base64: '<base64>'
      });
      try {
        await this.options.onScreenshot?.(output, page);
      } catch (error) {
        this.logger.error('Error in onScreenshot callback:', error);
      }
      return output;
    } catch (error) {
      if (this.highlightClickableElements) await this.uiHelper.removeClickableHighlights();
      this.logger.error('Screenshot failed:', error);
      throw error;
    }
  }

  /**
   * Convert a box string into page coordinates by parsing it with
   * `parseBoxToScreenCoords` and dividing by the device scale factor.
   * A coordinate of 0 is valid; anything that is not a finite number
   * becomes `null`.
   * @returns {{x: number|null, y: number|null}}
   */
  boxToPageCoords(boxStr, screenWidth, screenHeight, deviceScaleFactor) {
    const coords = parseBoxToScreenCoords({
      boxStr,
      screenWidth,
      screenHeight
    });
    const scale = (value) => {
      // typeof guard: `null / n` is 0, which would otherwise look like a real coordinate.
      if (typeof value !== 'number') return null;
      const scaled = value / deviceScaleFactor;
      return Number.isFinite(scaled) ? scaled : null;
    };
    return {
      x: scale(coords.x),
      y: scale(coords.y)
    };
  }

  /**
   * Execute one parsed prediction.
   * @param {{parsedPrediction: object, screenWidth: number, screenHeight: number}} params
   * @returns {Promise<{startX: number|null, startY: number|null, action_inputs: object}>}
   * @throws Rethrows the action's own error after calling `cleanup()`.
   */
  async execute(params) {
    this.logger.info('Starting execute with params:', params);
    const { parsedPrediction, screenWidth, screenHeight } = params;
    if (this.showActionInfo) {
      await this.uiHelper?.showActionInfo(parsedPrediction);
    }
    await this.options.onOperatorAction?.(parsedPrediction);
    const { action_type, action_inputs } = parsedPrediction;
    const startBoxStr = action_inputs?.start_box || '';
    const deviceScaleFactor = await this.getDeviceScaleFactor();
    const { x: startX, y: startY } = this.boxToPageCoords(startBoxStr, screenWidth, screenHeight, deviceScaleFactor);
    this.logger.info(`Parsed coordinates: (${startX}, ${startY})`);
    this.logger.info(`Executing action: ${action_type}`);
    const requireStartPoint = () => {
      if (startX === null || startY === null) {
        throw new Error(`Missing startX(${startX}) or startY(${startY}).`);
      }
    };
    try {
      await this.getActivePage();
      switch (action_type) {
        case 'drag':
          await this.handleDrag(action_inputs, deviceScaleFactor, screenWidth, screenHeight);
          break;
        case 'navigate':
          if (!action_inputs?.content) throw new Error('No target url specified for navigation');
          await this.handleNavigate({
            url: action_inputs.content
          });
          break;
        case 'navigate_back':
          await this.handleNavigateBack();
          break;
        case 'click':
        case 'left_click':
        case 'left_single':
          requireStartPoint();
          await this.handleClick(startX, startY);
          break;
        case 'double_click':
        case 'left_double':
          requireStartPoint();
          await this.handleDoubleClick(startX, startY);
          break;
        case 'right_click':
          requireStartPoint();
          await this.handleRightClick(startX, startY);
          break;
        case 'type':
          await this.handleType(action_inputs);
          await this.delay(1000);
          break;
        case 'hotkey':
          await this.handleHotkey(action_inputs);
          break;
        case 'press':
          await this.handlePress(action_inputs);
          break;
        case 'release':
          await this.handleRelease(action_inputs);
          break;
        case 'scroll':
          await this.handleScroll(action_inputs);
          break;
        case 'wait':
          await this.delay(5000);
          break;
        case 'finished':
          if (this.options.onFinalAnswer && parsedPrediction.thought) await this.options.onFinalAnswer(parsedPrediction.thought);
          await this.uiHelper.cleanup();
          break;
        case 'call_user':
        case 'user_stop':
          await this.uiHelper.cleanup();
          break;
        default:
          this.logger.warn(`[BrowserOperator] Unsupported action: ${action_type}`);
      }
      this.logger.info(`Action ${action_type} completed successfully`);
    } catch (error) {
      this.logger.error(`Failed to execute ${action_type}:`, error);
      try {
        await this.cleanup();
      } catch (cleanupError) {
        // Keep the action's own error as the one callers see.
        this.logger.error('Cleanup after failed action also failed:', cleanupError);
      }
      throw error;
    }
    return {
      startX,
      startY,
      action_inputs
    };
  }

  /**
   * Shared click sequence: show the indicator, move, click, then pause.
   * `gerund`/`noun` only select the log wording; `clickOptions` is passed to
   * `page.mouse.click` unchanged (may be undefined).
   */
  async performClick(x, y, gerund, noun, clickOptions) {
    this.logger.info(`${gerund} at (${x}, ${y})`);
    const page = await this.getActivePage();
    try {
      await this.uiHelper?.showClickIndicator(x, y);
      await this.delay(300);
      await page.mouse.move(x, y);
      await this.delay(100);
      await page.mouse.click(x, y, clickOptions);
      await this.delay(800);
      this.logger.info(`${noun} completed`);
    } catch (error) {
      this.logger.error(`${noun} operation failed:`, error);
      throw error;
    }
  }

  /** Left-click at page coordinates (x, y). */
  async handleClick(x, y) {
    await this.performClick(x, y, 'Clicking', 'Click', undefined);
  }

  /** Double-click at page coordinates (x, y). */
  async handleDoubleClick(x, y) {
    await this.performClick(x, y, 'Double clicking', 'Double click', {
      clickCount: 2
    });
  }

  /** Right-click at page coordinates (x, y). */
  async handleRightClick(x, y) {
    await this.performClick(x, y, 'Right clicking', 'Right click', {
      button: 'right'
    });
  }

  /**
   * Type `inputs.content` into the page. A trailing newline, either a real
   * line feed or the two characters `\n`, is not typed; instead Enter is
   * pressed and a possible navigation is awaited.
   */
  async handleType(inputs) {
    const page = await this.getActivePage();
    const raw = inputs?.content == null ? '' : String(inputs.content);
    const content = raw.trim();
    if (!content) return void this.logger.warn('No content to type');
    this.logger.info('Typing content:', content);
    // trim() removes a real trailing line feed, so it must be detected on the raw text.
    const shouldSubmit = /\n\s*$/.test(raw) || content.endsWith('\\n');
    const stripContent = content.replace(/\\n$/, '').replace(/\n$/, '');
    await page.keyboard.type(stripContent, {
      delay: 20 + 30 * Math.random()
    });
    if (!shouldSubmit) {
      this.logger.info('Typing completed');
      return;
    }
    await this.delay(50);
    this.logger.info('Pressing Enter after content');
    await page.keyboard.press('Enter');
    this.logger.info('Typing completed');
    await this.waitForPossibleNavigation(page);
  }

  /**
   * Split a key spec on whitespace and `+` and map each token through
   * `KEY_MAPPINGS`. Runs of separators (as in `"ctrl + c"`) count as one.
   * @throws {Error} On an unmapped token or a spec with no tokens at all.
   */
  normalizeKeys(keyStr) {
    const tokens = String(keyStr).split(/[\s+]+/).filter(Boolean);
    if (tokens.length === 0) throw new Error(`Unsupported key: ${keyStr}`);
    return tokens.map((key) => {
      const keyInput = KEY_MAPPINGS[key.toLowerCase()];
      if (keyInput) return keyInput;
      throw new Error(`Unsupported key: ${key}`);
    });
  }

  /**
   * After key input: wait for a possible navigation when Enter or F5 was
   * involved, otherwise pause for 500 ms.
   */
  async settleAfterKeys(page, normalizedKeys, waitMessage) {
    const navigationKeys = [
      'Enter',
      'F5'
    ];
    if (normalizedKeys.some((key) => navigationKeys.includes(key))) {
      this.logger.info(waitMessage);
      await this.waitForPossibleNavigation(page);
    } else {
      await this.delay(500);
    }
  }

  /** Run the key combination in `inputs.key` (or `inputs.hotkey`) via `shortcuts`. */
  async handleHotkey(inputs) {
    const page = await this.getActivePage();
    const keyStr = inputs?.key || inputs?.hotkey;
    if (!keyStr) {
      this.logger.warn('No hotkey specified');
      throw new Error("No hotkey specified");
    }
    this.logger.info(`Executing hotkey: ${keyStr}`);
    const normalizedKeys = this.normalizeKeys(keyStr);
    this.logger.info("Normalized keys:", normalizedKeys);
    await shortcuts(page, normalizedKeys, this.options.browserType);
    await this.settleAfterKeys(page, normalizedKeys, 'Waiting for possible navigation after hotkey');
    this.logger.info('Hotkey execution completed');
  }

  /** Hold down every key named in `inputs.key`, in order. */
  async handlePress(inputs) {
    const page = await this.getActivePage();
    const keyStr = inputs?.key;
    if (!keyStr) {
      this.logger.warn('No key specified for press');
      throw new Error("No key specified for press");
    }
    this.logger.info(`Pressing key: ${keyStr}`);
    const normalizedKeys = this.normalizeKeys(keyStr);
    this.logger.info("Normalized keys for press:", normalizedKeys);
    for (const key of normalizedKeys) {
      await page.keyboard.down(key);
      await this.delay(50);
    }
    this.logger.info('Press operation completed');
  }

  /** Release every key named in `inputs.key`, in order. */
  async handleRelease(inputs) {
    const page = await this.getActivePage();
    const keyStr = inputs?.key;
    if (!keyStr) {
      this.logger.warn('No key specified for release');
      throw new Error("No key specified for release");
    }
    this.logger.info(`Releasing key: ${keyStr}`);
    const normalizedKeys = this.normalizeKeys(keyStr);
    this.logger.info("Normalized keys for release:", normalizedKeys);
    for (const key of normalizedKeys) {
      await page.keyboard.up(key);
      await this.delay(50);
    }
    await this.settleAfterKeys(page, normalizedKeys, 'Waiting for possible navigation after key release');
    this.logger.info('Release operation completed');
  }

  /**
   * Scroll by 500px in `inputs.direction` (`up`, `down`, `left` or `right`,
   * case-insensitive). Any other value is logged and ignored.
   */
  async handleScroll(inputs) {
    const page = await this.getActivePage();
    const direction = inputs?.direction;
    const scrollAmount = 500;
    this.logger.info(`Scrolling ${direction} by ${scrollAmount}px`);
    const wheelDeltas = new Map([
      ['up', { deltaY: -scrollAmount }],
      ['down', { deltaY: scrollAmount }],
      ['left', { deltaX: -scrollAmount }],
      ['right', { deltaX: scrollAmount }]
    ]);
    const wheelDelta = wheelDeltas.get(typeof direction === 'string' ? direction.toLowerCase() : undefined);
    if (!wheelDelta) {
      this.logger.warn(`Unsupported scroll direction: ${direction}`);
      return;
    }
    await page.mouse.wheel(wheelDelta);
    this.logger.info('Scroll completed');
  }

  /** Resolve after `ms` milliseconds. */
  async delay(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /** Navigate to `inputs.url`, prefixing `https://` when no http(s) scheme is present. */
  async handleNavigate(inputs) {
    const page = await this.getActivePage();
    const requested = inputs.url;
    const url = /^https?:\/\//i.test(requested) ? requested : `https://${requested}`;
    this.logger.info(`Navigating to: ${url}`);
    await page.goto(url, {
      waitUntil: []
    });
    this.logger.info('Navigation completed');
  }

  /**
   * Drag from `inputs.start_box` to `inputs.end_box` in 10 interpolated steps.
   * If a step fails while the button is held, the button is released before
   * the error is rethrown.
   * @throws {Error} When either box is missing or does not parse to coordinates.
   */
  async handleDrag(inputs, deviceScaleFactor, screenWidth, screenHeight) {
    const page = await this.getActivePage();
    const startBoxStr = inputs?.start_box || '';
    const endBoxStr = inputs?.end_box || '';
    if (!startBoxStr || !endBoxStr) throw new Error('Missing start_point or end_point for drag operation');
    const { x: startX, y: startY } = this.boxToPageCoords(startBoxStr, screenWidth, screenHeight, deviceScaleFactor);
    const { x: endX, y: endY } = this.boxToPageCoords(endBoxStr, screenWidth, screenHeight, deviceScaleFactor);
    if ([startX, startY, endX, endY].some((value) => value === null)) throw new Error('Invalid coordinates for drag operation');
    this.logger.info(`Dragging from (${startX}, ${startY}) to (${endX}, ${endY})`);
    let buttonHeld = false;
    try {
      await this.uiHelper?.showDragIndicator(startX, startY, endX, endY);
      await this.delay(300);
      await page.mouse.move(startX, startY);
      await this.delay(100);
      await page.mouse.down();
      buttonHeld = true;
      const steps = 10;
      for (let i = 1; i <= steps; i++) {
        const stepX = startX + (endX - startX) * i / steps;
        const stepY = startY + (endY - startY) * i / steps;
        await page.mouse.move(stepX, stepY);
        await this.delay(30);
      }
      await this.delay(100);
      await page.mouse.up();
      buttonHeld = false;
      await this.delay(800);
      this.logger.info('Drag completed');
    } catch (error) {
      this.logger.error('Drag operation failed:', error);
      if (buttonHeld) {
        try {
          await page.mouse.up();
        } catch (releaseError) {
          this.logger.warn('Failed to release mouse button after drag failure:', releaseError);
        }
      }
      throw error;
    }
  }

  /** Go back one entry in the active page's history. */
  async handleNavigateBack() {
    const page = await this.getActivePage();
    this.logger.info("handleNavigateBack");
    await page.goBack();
    this.logger.info('handleNavigateBack completed');
  }

  /**
   * Resolve on the page's next `framenavigated` event or after 5 seconds,
   * whichever comes first. The listener and the timer are both released
   * either way.
   */
  async waitForPossibleNavigation(page) {
    await new Promise((resolve) => {
      const finish = (navigationStarted) => {
        clearTimeout(timer);
        page.off('framenavigated', onStarted);
        if (navigationStarted) this.logger.info('Navigation started');
        resolve();
      };
      const onStarted = () => finish(true);
      const timer = setTimeout(() => finish(false), 5000);
      page.on('framenavigated', onStarted);
    });
    this.logger.info('Navigation completed or timed out');
  }

  /**
   * Return the device scale factor, caching the first truthy value found in
   * the page viewport or, failing that, `window.devicePixelRatio`.
   * @throws {Error} When neither source yields a truthy value.
   */
  async getDeviceScaleFactor() {
    if (this.deviceScaleFactor) return this.deviceScaleFactor;
    this.logger.info('Getting deviceScaleFactor info...');
    const page = await this.getActivePage();
    const scaleFactor = page.viewport()?.deviceScaleFactor;
    if (scaleFactor) {
      this.deviceScaleFactor = scaleFactor;
      return scaleFactor;
    }
    const devicePixelRatio = await page.evaluate(() => window.devicePixelRatio);
    if (devicePixelRatio) {
      this.deviceScaleFactor = devicePixelRatio;
      return devicePixelRatio;
    }
    throw new Error('Get deviceScaleFactor failed.');
  }

  /**
   * Clean up the UI helper, then close the remembered page (if any). The page
   * reference is dropped even when closing it throws.
   */
  async cleanup() {
    this.logger.info('Starting cleanup...');
    await this.uiHelper.cleanup();
    const page = this.currentPage;
    if (page) {
      try {
        await page.close();
        this.logger.info('Page closed successfully');
      } finally {
        this.currentPage = null;
      }
    }
    this.logger.info('Cleanup completed');
  }
}
/**
 * Singleton-style `BrowserOperator` backed by a locally launched browser.
 * The shared operator, browser, logger and detected browser path/type are
 * kept in static fields on the class.
 */
class DefaultBrowserOperator extends BrowserOperator {
  constructor(options) {
    super(options);
  }

  /**
   * Report whether a usable local browser is known, looking one up with
   * `BrowserFinder` on first use and caching its path and type.
   * @param {*} browser - Passed through to `BrowserFinder#findBrowser`.
   * @returns {boolean} `false` when the lookup throws, `true` otherwise.
   */
  static hasBrowser(browser) {
    try {
      if (this.browserPath) return true;
      if (!this.logger) this.logger = new ConsoleLogger('[DefaultBrowserOperator]');
      const browserData = new BrowserFinder(this.logger).findBrowser(browser);
      this.browserPath = browserData.path;
      this.browserType = browserData.type;
      return true;
    } catch (error) {
      if (this.logger) this.logger.error('No available browser found:', error);
      return false;
    }
  }

  /**
   * Return the shared operator, launching the browser and creating the
   * operator when needed. The three visual flags are applied on every call,
   * including when an existing operator is reused.
   *
   * @param {boolean} [highlight=false]
   * @param {boolean} [showActionInfo=false]
   * @param {boolean} [showWaterFlow=false]
   * @param {boolean} [isCallUser=false] - When false, a new page is opened on
   *   the selected search engine.
   * @param {string} [searchEngine='google'] - Unknown values fall back to Google.
   * @returns {Promise<DefaultBrowserOperator>}
   */
  static async getInstance(highlight = false, showActionInfo = false, showWaterFlow = false, isCallUser = false, searchEngine = 'google') {
    if (!this.logger) this.logger = new ConsoleLogger('[DefaultBrowserOperator]');
    if (this.browser && !(await this.browser.isBrowserAlive())) {
      // The old operator is bound to the dead browser, so drop both.
      this.browser = null;
      this.instance = null;
    }
    if (!this.browser) {
      const browser = new LocalBrowser({
        logger: this.logger
      });
      await browser.launch({
        executablePath: this.browserPath,
        browserType: this.browserType
      });
      // Only publish the browser once it has actually launched.
      this.browser = browser;
    }
    if (!this.instance) {
      this.instance = new DefaultBrowserOperator({
        browser: this.browser,
        browserType: this.browserType,
        logger: this.logger,
        highlightClickableElements: highlight,
        showActionInfo: showActionInfo,
        showWaterFlow: showWaterFlow
      });
    }
    if (!isCallUser) {
      const openingPage = await this.browser?.createPage();
      const searchEngineUrls = {
        [SearchEngine.GOOGLE]: 'https://www.google.com/',
        [SearchEngine.BING]: 'https://www.bing.com/',
        [SearchEngine.BAIDU]: 'https://www.baidu.com/'
      };
      const isKnownEngine = Object.prototype.hasOwnProperty.call(searchEngineUrls, searchEngine);
      if (!isKnownEngine) {
        this.logger.warn(`Unknown search engine "${searchEngine}", falling back to Google`);
      }
      const targetUrl = isKnownEngine ? searchEngineUrls[searchEngine] : searchEngineUrls[SearchEngine.GOOGLE];
      await openingPage?.goto(targetUrl, {
        waitUntil: 'networkidle2'
      });
    }
    this.instance.setHighlightClickableElements(highlight);
    this.instance.setShowWaterFlow(showWaterFlow);
    this.instance.showActionInfo = showActionInfo;
    return this.instance;
  }

  /**
   * Clean up the shared operator and close the shared browser. The browser is
   * closed even when the operator's cleanup throws, and the static references
   * are cleared in every case.
   */
  static async destroyInstance() {
    const instance = this.instance;
    const browser = this.browser;
    this.instance = null;
    this.browser = null;
    try {
      if (instance) await instance.cleanup();
    } finally {
      if (browser) await browser.close();
    }
  }
}
// Static fields of DefaultBrowserOperator and their initial values, defined in
// this order: shared operator, shared browser, detected browser path/type, logger.
for (const [staticField, initialValue] of [
  ["instance", null],
  ["browser", null],
  ["browserPath", void 0],
  ["browserType", void 0],
  ["logger", null]
]) {
  _define_property(DefaultBrowserOperator, staticField, initialValue);
}
/**
 * `BrowserOperator` backed by a `RemoteBrowser` reached through a WebSocket
 * endpoint. The most recently created operator and browser are kept in
 * static fields on the class.
 */
class RemoteBrowserOperator extends BrowserOperator {
  constructor(options) {
    super(options);
  }

  /**
   * Connect to `wsEndpoint` and create a new operator for it. Every call
   * creates a fresh browser and operator; the static references are replaced
   * only after the new browser has launched, so a failed launch leaves the
   * previous browser/operator pair in place.
   *
   * @param {string} wsEndpoint
   * @param {boolean} [highlight=false]
   * @param {boolean} [showActionInfo=false]
   * @param {boolean} [showWaterFlow=false]
   * @param {boolean} [isCallUser=false] - Accepted but not used by this method.
   * @returns {Promise<RemoteBrowserOperator>}
   */
  static async getInstance(wsEndpoint, highlight = false, showActionInfo = false, showWaterFlow = false, isCallUser = false) {
    if (!this.logger) this.logger = new ConsoleLogger('[RemoteBrowserOperator]');
    const browser = new RemoteBrowser({
      wsEndpoint: wsEndpoint,
      logger: this.logger
    });
    await browser.launch();
    const instance = new RemoteBrowserOperator({
      browser,
      browserType: this.browserType,
      logger: this.logger,
      highlightClickableElements: highlight,
      showActionInfo: showActionInfo,
      showWaterFlow: showWaterFlow
    });
    this.browser = browser;
    this.instance = instance;
    instance.setHighlightClickableElements(highlight);
    return instance;
  }

  /**
   * Clean up the current operator and close the current browser. The browser
   * is closed even when the operator's cleanup throws, and the static
   * references are cleared in every case.
   */
  static async destroyInstance() {
    const instance = this.instance;
    const browser = this.browser;
    this.instance = null;
    this.browser = null;
    try {
      if (instance) await instance.cleanup();
    } finally {
      if (browser) await browser.close();
    }
  }
}
// Static fields of RemoteBrowserOperator and their initial values, defined in
// this order: current operator, current browser, browser type, logger.
for (const [staticField, initialValue] of [
  ["instance", null],
  ["browser", null],
  ["browserType", void 0],
  ["logger", null]
]) {
  _define_property(RemoteBrowserOperator, staticField, initialValue);
}
export { BrowserOperator, DefaultBrowserOperator, RemoteBrowserOperator };
//# sourceMappingURL=browser-operator.mjs.map