// @gui-agent/operator-browser
// Version:
// Native-browser operator for GUI Agent
// 572 lines (571 loc) • 24.2 kB
// JavaScript
/**
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
"use strict";
// Minimal webpack runtime used by this bundle:
//   d - define lazy, enumerable getter exports on a target object
//   o - own-property test
//   r - mark an exports object as an ES module
var __webpack_require__ = {
    d: (exports1, definition)=>{
        for(const key in definition){
            const isOwnDefinition = __webpack_require__.o(definition, key);
            const alreadyExported = __webpack_require__.o(exports1, key);
            if (!isOwnDefinition || alreadyExported) continue;
            Object.defineProperty(exports1, key, {
                enumerable: true,
                get: definition[key]
            });
        }
    },
    o: (obj, prop)=>Object.prototype.hasOwnProperty.call(obj, prop),
    r: (exports1)=>{
        const hasToStringTag = 'undefined' != typeof Symbol && Symbol.toStringTag;
        if (hasToStringTag) {
            Object.defineProperty(exports1, Symbol.toStringTag, {
                value: 'Module'
            });
        }
        Object.defineProperty(exports1, '__esModule', {
            value: true
        });
    }
};
// Public surface of this module; the getter is lazy, so BrowserOperator may be
// declared later in the file.
var __webpack_exports__ = {};
__webpack_require__.r(__webpack_exports__);
__webpack_require__.d(__webpack_exports__, {
    BrowserOperator: ()=>BrowserOperator
});
const logger_namespaceObject = require("@agent-infra/logger");
const puppeteer_enhance_namespaceObject = require("@agent-infra/puppeteer-enhance");
const base_namespaceObject = require("@gui-agent/shared/base");
const utils_namespaceObject = require("@gui-agent/shared/utils");
const external_ui_helper_js_namespaceObject = require("./ui-helper.js");
/**
 * Sets `obj[key]` to `value` and returns `obj`.
 *
 * When `key` is already reachable on `obj` (own or inherited) the value is
 * installed with `Object.defineProperty` as a plain writable, enumerable,
 * configurable data property; otherwise a normal assignment is used.
 *
 * @param {object} obj - Object receiving the property.
 * @param {string|symbol} key - Property name.
 * @param {*} value - Value to store.
 * @returns {object} The same `obj`.
 */
function _define_property(obj, key, value) {
    const alreadyPresent = key in obj;
    if (!alreadyPresent) {
        obj[key] = value;
        return obj;
    }
    const descriptor = {
        value: value,
        enumerable: true,
        configurable: true,
        writable: true
    };
    Object.defineProperty(obj, key, descriptor);
    return obj;
}
/**
 * Operator that carries out GUI-agent actions (click, type, scroll, drag,
 * navigate, ...) on the currently visible page of the browser supplied through
 * `options.browser`, using the page's `mouse` / `keyboard` APIs.
 *
 * NOTE(review): `getScreenContext()` is called below but is not defined in this
 * class — presumably it is inherited from `Operator`; confirm against the base class.
 */
class BrowserOperator extends base_namespaceObject.Operator {
    /**
     * @param {object} options
     * @param {object} options.browser - Object exposing `getBrowser()`, `getActivePage()` and `close()`.
     * @param {object} [options.logger] - Logger with `spawn()`; falls back to the package default logger.
     * @param {boolean} [options.showActionInfo] - Pass `false` to disable the action-info overlay.
     * @param {boolean} [options.showWaterFlow] - Pass `false` to disable the water-flow effect.
     * @param {boolean} [options.highlightClickableElements] - Pass `false` to disable element highlighting.
     */
    constructor(options){
        super();
        const instanceFields = [
            "options",
            "logger",
            "browser",
            "uiHelper",
            "showActionInfo",
            "showWaterFlowEffect",
            "highlightClickableElements",
            "deviceScaleFactor",
            "currentScreenContext",
            "currentPage",
            "hotkeyExecutor"
        ];
        for (const field of instanceFields)_define_property(this, field, void 0);
        this.options = options;
        // Every visual aid is on unless the caller explicitly passes `false`.
        this.showActionInfo = false !== options.showActionInfo;
        this.showWaterFlowEffect = false !== options.showWaterFlow;
        this.highlightClickableElements = false !== options.highlightClickableElements;
        this.currentPage = null;
        this.browser = this.options.browser;
        this.logger = (this.options.logger ?? logger_namespaceObject.defaultLogger).spawn('[BrowserOperator]');
        this.uiHelper = new external_ui_helper_js_namespaceObject.UIHelper(()=>this.getActivePage(), this.logger);
    }
    /**
     * Turns the per-action info overlay shown by `execute()` on or off.
     * @param {boolean} enable
     */
    setShowActionInfo(enable) {
        this.showActionInfo = enable;
        this.logger.info(`Show Action info ${enable ? 'enabled' : 'disabled'}`);
    }
    /**
     * Turns the water-flow effect shown while taking screenshots on or off.
     * @param {boolean} enable
     */
    setShowWaterFlow(enable) {
        this.showWaterFlowEffect = enable;
        this.logger.info(`Water flow effect ${enable ? 'enabled' : 'disabled'}`);
    }
    /**
     * Turns clickable-element highlighting during screenshots on or off.
     * @param {boolean} enable
     */
    setHighlightClickableElements(enable) {
        this.highlightClickableElements = enable;
        this.logger.info(`Clickable elements highlighting ${enable ? 'enabled' : 'disabled'}`);
    }
    /**
     * Cleans up the UI helper and closes `currentPage` if one is tracked.
     * @returns {Promise<void>}
     */
    async cleanup() {
        this.logger.info('Starting cleanup...');
        await this.uiHelper.cleanup();
        if (this.currentPage) {
            await this.currentPage.close();
            this.currentPage = null;
            this.logger.info('Page closed successfully');
        }
        this.logger.info('Cleanup completed');
    }
    /**
     * Runs `cleanup()` and then closes the browser.
     * The browser is closed even when `cleanup()` fails, so it is never leaked.
     * @returns {Promise<void>}
     */
    async destroyInstance() {
        this.logger.debug('destroyInstance: start');
        try {
            await this.cleanup();
        } finally{
            if (this.browser) await this.browser.close();
        }
    }
    /**
     * Reads the viewport size and device scale factor of the active page and
     * caches them as the current screen context.
     * @returns {Promise<void>}
     * @throws {Error} When the viewport size or scale factor cannot be determined.
     */
    async initialize() {
        this.logger.info('initialize: getting screen context info...');
        const { width, height } = await this.getScreenRect();
        const scaleFactor = await this.getDeviceScaleFactor();
        this.currentScreenContext = {
            screenWidth: width,
            screenHeight: height,
            scaleX: scaleFactor ?? 1,
            scaleY: scaleFactor ?? 1
        };
    }
    /**
     * @returns {string[]} Action type names advertised by this operator.
     */
    supportedActions() {
        return [
            'drag',
            'navigate',
            'navigate_back',
            'click',
            'double_click',
            'right_click',
            'type',
            'hotkey',
            'press',
            'release',
            'scroll',
            'wait',
            'finished',
            'call_user'
        ];
    }
    /**
     * @returns {{screenWidth: number, screenHeight: number, scaleX: number, scaleY: number}}
     *   The context cached by `initialize()`.
     * @throws {Error} When `initialize()` has not populated the context yet.
     */
    screenContext() {
        if (this.currentScreenContext) return this.currentScreenContext;
        throw Error('Get screenContext failed.');
    }
    /**
     * Captures the active page's viewport as a base64 JPEG (quality 75).
     *
     * Overlays that were switched on for the capture are always torn down
     * afterwards, including when the capture itself fails.
     *
     * @returns {Promise<{status: string, base64: string, url: string}>}
     * @throws Rethrows any error raised while preparing or taking the screenshot.
     */
    async screenshot() {
        this.logger.info('Starting screenshot...');
        // Track what was actually switched on so teardown mirrors setup even if
        // the feature flags are toggled while the capture is in flight.
        let waterFlowShown = false;
        let highlightsRequested = false;
        try {
            if (this.showWaterFlowEffect) {
                await this.uiHelper.showWaterFlow();
                waterFlowShown = true;
            }
            const page = await this.getActivePage();
            if (this.highlightClickableElements) {
                highlightsRequested = true;
                this.logger.info('Highlighting clickable elements...');
                await this.uiHelper.highlightClickableElements();
                await (0, utils_namespaceObject.sleep)(300);
            }
            const startTime = Date.now();
            await this.uiHelper.cleanupTemporaryVisuals();
            const buffer = await page.screenshot({
                captureBeyondViewport: false,
                encoding: 'base64',
                type: 'jpeg',
                quality: 75,
                fullPage: false
            });
            const duration = Date.now() - startTime;
            this.logger.info(`Screenshot taken in ${duration}ms`);
            const output = {
                status: 'success',
                base64: buffer.toString(),
                url: (await this.getMeta()).url
            };
            // Keep the (large) image payload out of the log.
            this.logger.info('Screenshot Info', {
                ...output,
                base64: '<base64>'
            });
            return output;
        } catch (error) {
            this.logger.error('Screenshot failed:', error);
            throw error;
        } finally{
            // Nested so the water-flow overlay is hidden even if removing the
            // highlights throws.
            try {
                if (highlightsRequested) await this.uiHelper.removeClickableHighlights();
            } finally{
                if (waterFlowShown) await this.uiHelper.hideWaterFlow();
            }
        }
    }
    /**
     * Executes a list of actions one after another.
     * @param {{actions: object[], reasoningContent?: string}} params
     * @returns {Promise<{status: string}>}
     * @throws Rethrows the first action failure.
     */
    async execute(params) {
        const { actions, reasoningContent } = params;
        for (const action of actions){
            if (this.showActionInfo) {
                this.logger.info('Show action info');
                await this.uiHelper?.showActionInfo(action, reasoningContent ?? '');
            }
            this.logger.info('Execute action', action);
            await this.singleActionExecutor(action);
        }
        return {
            status: 'success'
        };
    }
    /**
     * Dispatches one action to its handler.
     *
     * On failure the operator is cleaned up and the ORIGINAL error is rethrown;
     * a failure during that cleanup is logged instead of replacing it.
     *
     * @param {{type: string, inputs?: object}} action
     * @returns {Promise<{status: string}>}
     */
    async singleActionExecutor(action) {
        this.logger.info('Starting execute with action:', JSON.stringify(action));
        const { type: action_type, inputs: action_inputs } = action;
        this.logger.info(`Executing action: ${action_type}`);
        try {
            // Resolved only for its side effect: fails early when no page is available.
            await this.getActivePage();
            switch(action_type){
                case 'drag':
                    await this.handleDrag(action_inputs);
                    break;
                case 'navigate':
                    await this.handleNavigate(action_inputs);
                    break;
                case 'navigate_back':
                    await this.handleNavigateBack();
                    break;
                case 'click':
                case 'left_click':
                case 'left_single':
                    await this.handleClick(action_inputs);
                    break;
                case 'double_click':
                case 'left_double':
                    await this.handleDoubleClick(action_inputs);
                    break;
                case 'right_click':
                    await this.handleRightClick(action_inputs);
                    break;
                case 'type':
                    await this.handleType(action_inputs);
                    await (0, utils_namespaceObject.sleep)(1000);
                    break;
                case 'hotkey':
                    await this.handleHotkey(action_inputs);
                    break;
                case 'press':
                    await this.handlePress(action_inputs);
                    break;
                case 'release':
                    await this.handleRelease(action_inputs);
                    break;
                case 'scroll':
                    await this.handleScroll(action_inputs);
                    break;
                case 'wait':
                    // `time` is in seconds; missing, zero or non-numeric values fall back to 5s.
                    await (0, utils_namespaceObject.sleep)(1000 * action_inputs?.time || 5000);
                    break;
                case 'finished':
                case 'call_user':
                    // Awaited so a failing UI cleanup surfaces here rather than as
                    // an unhandled rejection.
                    await this.uiHelper.cleanup();
                    break;
                default:
                    this.logger.warn(`Unsupported action: ${action_type}`);
            }
            this.logger.info(`Action ${action_type} completed successfully`);
        } catch (error) {
            this.logger.error(`Failed to execute ${action_type}:`, error);
            try {
                await this.cleanup();
            } catch (cleanupError) {
                this.logger.error('Cleanup after failed action also failed:', cleanupError);
            }
            throw error;
        }
        return {
            status: 'success'
        };
    }
    /**
     * Left-clicks at `inputs.point`.
     * @param {{point: object}} inputs
     * @returns {Promise<void>}
     * @throws {Error} When no point is supplied, or when the click fails.
     */
    async handleClick(inputs) {
        if (!inputs?.point) throw new Error("Missing point for click.");
        const { realX: x, realY: y } = await this.calculateRealCoords(inputs.point);
        this.logger.info(`Clicking at (${x}, ${y})`);
        const page = await this.getActivePage();
        try {
            await this.uiHelper?.showClickIndicator(x, y);
            await (0, utils_namespaceObject.sleep)(300);
            await page.mouse.move(x, y);
            await (0, utils_namespaceObject.sleep)(100);
            await page.mouse.click(x, y);
            await (0, utils_namespaceObject.sleep)(800);
            this.logger.info('Click completed');
        } catch (error) {
            this.logger.error('Click operation failed:', error);
            throw error;
        }
    }
    /**
     * Double-clicks at `inputs.point`.
     * @param {{point: object}} inputs
     * @returns {Promise<void>}
     * @throws {Error} When no point is supplied, or when the click fails.
     */
    async handleDoubleClick(inputs) {
        if (!inputs?.point) throw new Error("Missing point for double click.");
        const { realX: x, realY: y } = await this.calculateRealCoords(inputs.point);
        this.logger.info(`Double clicking at (${x}, ${y})`);
        const page = await this.getActivePage();
        try {
            await this.uiHelper?.showClickIndicator(x, y);
            await (0, utils_namespaceObject.sleep)(300);
            await page.mouse.move(x, y);
            await (0, utils_namespaceObject.sleep)(100);
            await page.mouse.click(x, y, {
                clickCount: 2
            });
            await (0, utils_namespaceObject.sleep)(800);
            this.logger.info('Double click completed');
        } catch (error) {
            this.logger.error('Double click operation failed:', error);
            throw error;
        }
    }
    /**
     * Right-clicks at `inputs.point`.
     * @param {{point: object}} inputs
     * @returns {Promise<void>}
     * @throws {Error} When no point is supplied, or when the click fails.
     */
    async handleRightClick(inputs) {
        if (!inputs?.point) throw new Error("Missing point for right click.");
        const { realX: x, realY: y } = await this.calculateRealCoords(inputs.point);
        this.logger.info(`Right clicking at (${x}, ${y})`);
        const page = await this.getActivePage();
        try {
            await this.uiHelper?.showClickIndicator(x, y);
            await (0, utils_namespaceObject.sleep)(300);
            await page.mouse.move(x, y);
            await (0, utils_namespaceObject.sleep)(100);
            await page.mouse.click(x, y, {
                button: 'right'
            });
            await (0, utils_namespaceObject.sleep)(800);
            this.logger.info('Right click completed');
        } catch (error) {
            this.logger.error('Right click operation failed:', error);
            throw error;
        }
    }
    /**
     * Types `inputs.content` into the active page. When the content ends with a
     * newline — a real one or the two-character sequence backslash + "n" — the
     * newline is not typed; Enter is pressed instead and a possible navigation
     * is awaited.
     *
     * @param {{content?: string}} inputs
     * @returns {Promise<void>}
     */
    async handleType(inputs) {
        const page = await this.getActivePage();
        const rawContent = inputs?.content;
        const content = rawContent?.trim();
        if (!content) return void this.logger.warn('No content to type');
        this.logger.info('Typing content:', content);
        // trim() has already removed a real trailing newline from `content`, so a
        // real newline must be looked for in the untrimmed tail of the input.
        const trailingWhitespace = rawContent.slice(rawContent.trimEnd().length);
        const shouldPressEnter = trailingWhitespace.includes('\n') || content.endsWith('\\n');
        const stripContent = content.replace(/\\n$/, '').replace(/\n$/, '');
        await page.keyboard.type(stripContent, {
            delay: 20 + 30 * Math.random()
        });
        if (shouldPressEnter) {
            await (0, utils_namespaceObject.sleep)(50);
            this.logger.info('Pressing Enter after content');
            await page.keyboard.press('Enter');
            this.logger.info('Typing completed');
            await this.waitForPossibleNavigation(page);
        }
    }
    /**
     * Presses a key combination given as `inputs.key` or `inputs.hotkey`.
     * Execution is best-effort: a failure inside the hotkey executor is logged
     * and not rethrown.
     *
     * @param {{key?: string, hotkey?: string}} inputs
     * @returns {Promise<void>}
     * @throws {Error} When no key string is supplied.
     */
    async handleHotkey(inputs) {
        const page = await this.getActivePage();
        const keyStr = inputs?.key || inputs?.hotkey;
        if (!keyStr) {
            this.logger.warn('No hotkey specified');
            throw new Error("No hotkey specified");
        }
        this.logger.info(`Executing hotkey: ${keyStr}`);
        try {
            const executor = await this.getHotkeyExecutor();
            await executor.press(page, keyStr);
        } catch (error) {
            this.logger.error('Hotkey execution failed:', error);
        }
    }
    /**
     * Holds down `inputs.key`. Best-effort: executor failures are logged only.
     * @param {{key?: string}} inputs
     * @returns {Promise<void>}
     * @throws {Error} When no key is supplied.
     */
    async handlePress(inputs) {
        const page = await this.getActivePage();
        const keyStr = inputs?.key;
        if (!keyStr) {
            this.logger.warn('No key specified for press');
            throw new Error("No key specified for press");
        }
        this.logger.info(`Pressing key: ${keyStr}`);
        try {
            const executor = await this.getHotkeyExecutor();
            await executor.down(page, keyStr);
        } catch (error) {
            this.logger.error('Press execution failed:', error);
        }
        this.logger.info('Press operation completed');
    }
    /**
     * Releases `inputs.key`. Best-effort: executor failures are logged only.
     * @param {{key?: string}} inputs
     * @returns {Promise<void>}
     * @throws {Error} When no key is supplied.
     */
    async handleRelease(inputs) {
        const page = await this.getActivePage();
        const keyStr = inputs?.key;
        if (!keyStr) {
            this.logger.warn('No key specified for release');
            throw new Error("No key specified for release");
        }
        this.logger.info(`Releasing key: ${keyStr}`);
        try {
            const executor = await this.getHotkeyExecutor();
            await executor.up(page, keyStr);
        } catch (error) {
            this.logger.error('Release execution failed:', error);
        }
        this.logger.info('Release operation completed');
    }
    /**
     * Moves the mouse to `inputs.point` and scrolls by 80% of the screen
     * height (up/down) or width (left/right), each divided by its scale factor.
     * An unknown direction is logged and ignored.
     *
     * @param {{point: object, direction: string}} inputs
     * @returns {Promise<void>}
     * @throws {Error} When no point is supplied.
     */
    async handleScroll(inputs) {
        const page = await this.getActivePage();
        if (!inputs?.point) throw new Error("No point specified for scroll");
        // A missing direction becomes '' and ends up in the "unsupported" branch
        // instead of raising a TypeError.
        const direction = String(inputs.direction ?? '').toLowerCase();
        const { realX: startX, realY: startY } = await this.calculateRealCoords(inputs.point);
        // 0 is a legitimate coordinate (left/top edge), so accept it explicitly.
        const isUsableCoord = (value)=>0 === value || Boolean(value);
        if (isUsableCoord(startX) && isUsableCoord(startY)) {
            this.logger.info(`Moving mouse to scroll position: (${startX}, ${startY})`);
            await page.mouse.move(startX, startY);
            await (0, utils_namespaceObject.sleep)(100);
        }
        const { screenWidth, screenHeight, scaleX, scaleY } = await this.getScreenContext();
        const isVertical = 'up' === direction || 'down' === direction;
        const scrollAmount = isVertical ? screenHeight / scaleY * 0.8 : screenWidth / scaleX * 0.8;
        this.logger.info(`Scrolling ${direction} by ${scrollAmount}px`);
        const wheelDeltaByDirection = new Map([
            [
                'up',
                {
                    deltaY: -scrollAmount
                }
            ],
            [
                'down',
                {
                    deltaY: scrollAmount
                }
            ],
            [
                'left',
                {
                    deltaX: -scrollAmount
                }
            ],
            [
                'right',
                {
                    deltaX: scrollAmount
                }
            ]
        ]);
        const wheelDelta = wheelDeltaByDirection.get(direction);
        if (!wheelDelta) {
            this.logger.warn(`Unsupported scroll direction: ${direction}`);
            return;
        }
        await page.mouse.wheel(wheelDelta);
        this.logger.info('Scroll completed');
    }
    /**
     * Navigates the active page to `inputs.url`, prefixing `https://` when the
     * URL does not start with `http://` or `https://`.
     *
     * @param {{url: string}} inputs
     * @returns {Promise<void>}
     * @throws {Error} When no URL is supplied.
     */
    async handleNavigate(inputs) {
        if (!inputs?.url) throw new Error('No target url specified for navigation');
        let { url } = inputs;
        if (!/^https?:\/\//i.test(url)) url = 'https://' + url;
        this.logger.info(`Navigating to: ${url}`);
        const page = await this.getActivePage();
        await page.goto(url, {
            waitUntil: []
        });
        this.logger.info('Navigation completed');
    }
    /**
     * Drags from `inputs.start` to `inputs.end` in 10 interpolated mouse moves.
     * The mouse button is released even if a move fails part-way.
     *
     * @param {{start: object, end: object}} inputs
     * @returns {Promise<void>}
     * @throws {Error} When a point is missing or resolves to an unusable coordinate.
     */
    async handleDrag(inputs) {
        if (!inputs?.start || !inputs?.end) throw new Error('Missing start_point or end_point for drag operation');
        const { realX: startX, realY: startY } = await this.calculateRealCoords(inputs.start);
        const { realX: endX, realY: endY } = await this.calculateRealCoords(inputs.end);
        // 0 is a legitimate coordinate (left/top edge); only missing/NaN values are rejected.
        const isUsableCoord = (value)=>0 === value || Boolean(value);
        const allUsable = [
            startX,
            startY,
            endX,
            endY
        ].every(isUsableCoord);
        if (!allUsable) throw new Error('Invalid coordinates for drag operation');
        try {
            const page = await this.getActivePage();
            await this.uiHelper?.showDragIndicator(startX, startY, endX, endY);
            await (0, utils_namespaceObject.sleep)(300);
            await page.mouse.move(startX, startY);
            await (0, utils_namespaceObject.sleep)(100);
            await page.mouse.down();
            try {
                const steps = 10;
                for(let i = 1; i <= steps; i++){
                    const stepX = startX + (endX - startX) * i / steps;
                    const stepY = startY + (endY - startY) * i / steps;
                    await page.mouse.move(stepX, stepY);
                    await (0, utils_namespaceObject.sleep)(30);
                }
                await (0, utils_namespaceObject.sleep)(100);
            } finally{
                // Never leave the button pressed, even when a move above threw.
                await page.mouse.up();
            }
            await (0, utils_namespaceObject.sleep)(800);
            this.logger.info('Drag completed');
        } catch (error) {
            this.logger.error('Drag operation failed:', error);
            throw error;
        }
    }
    /**
     * Navigates the active page one step back in its history.
     * @returns {Promise<void>}
     */
    async handleNavigateBack() {
        const page = await this.getActivePage();
        this.logger.info("handleNavigateBack");
        await page.goBack();
        this.logger.info('handleNavigateBack completed');
    }
    /**
     * Resolves as soon as `page` emits `framenavigated`, or after `timeoutMs`
     * if no navigation happens. Never rejects.
     *
     * @param {object} page - Page to observe.
     * @param {number} [timeoutMs=5000] - Maximum time to wait.
     * @returns {Promise<void>}
     */
    async waitForPossibleNavigation(page, timeoutMs = 5000) {
        await new Promise((resolve)=>{
            let timer;
            const onStarted = ()=>{
                // Cancel the fallback timer so it does not linger after an early resolve.
                clearTimeout(timer);
                page.off('framenavigated', onStarted);
                this.logger.info('Navigation started');
                resolve();
            };
            page.on('framenavigated', onStarted);
            timer = setTimeout(()=>{
                page.off('framenavigated', onStarted);
                resolve();
            }, timeoutMs);
        });
        this.logger.info('Navigation completed or timed out');
    }
    /**
     * @returns {Promise<{width: number, height: number}>} Viewport size of the active page.
     * @throws {Error} When the page reports no viewport width or height.
     */
    async getScreenRect() {
        const page = await this.getActivePage();
        const viewport = page.viewport();
        const width = viewport?.width;
        const height = viewport?.height;
        if (!width || !height) throw Error('Get screen context failed.');
        this.logger.debug('getScreenRect: w, h: ', `(${width} x ${height})`);
        return {
            width,
            height
        };
    }
    /**
     * Returns the device scale factor, caching the first value found. The
     * viewport's `deviceScaleFactor` is preferred; `window.devicePixelRatio`
     * is the fallback.
     *
     * @returns {Promise<number>}
     * @throws {Error} When neither source yields a value.
     */
    async getDeviceScaleFactor() {
        if (this.deviceScaleFactor) return this.deviceScaleFactor;
        const page = await this.getActivePage();
        const scaleFactor = page.viewport()?.deviceScaleFactor;
        if (scaleFactor) {
            this.deviceScaleFactor = scaleFactor;
            this.logger.debug('getDeviceScaleFactor: deviceScaleFactor: ', scaleFactor);
            return scaleFactor;
        }
        const devicePixelRatio = await page.evaluate(()=>window.devicePixelRatio);
        if (devicePixelRatio) {
            this.deviceScaleFactor = devicePixelRatio;
            this.logger.debug('getDeviceScaleFactor: devicePixelRatio: ', devicePixelRatio);
            return devicePixelRatio;
        }
        throw Error('Get deviceScaleFactor failed.');
    }
    /**
     * Finds the page the user is currently looking at.
     *
     * 1. Returns the first page whose `document.visibilityState` evaluates to
     *    `'visible'` (each check limited to 3s).
     * 2. Otherwise waits up to 3s per page for it to become visible.
     * 3. Otherwise falls back to `this.browser.getActivePage()`.
     *
     * @returns {Promise<object>} The selected page.
     */
    async getActivePage() {
        const pages = await this.browser.getBrowser().pages();
        this.logger.info(`get active pages len: ${pages.length}`);
        for (const page of pages){
            let timeoutTimer;
            try {
                const visibilityState = await Promise.race([
                    page.evaluate(()=>document.visibilityState),
                    new Promise((_, reject)=>{
                        timeoutTimer = setTimeout(()=>reject(new Error('Visibility check timed out after 3s')), 3000);
                    })
                ]);
                if ('visible' === visibilityState) {
                    this.logger.success('Active visible page retrieved successfully (direct check)');
                    return page;
                }
            } catch (evalError) {
                this.logger.warn('Warning: checking page visibility directly:', evalError);
            } finally{
                // Without this every call leaves one pending 3s timer per page behind.
                clearTimeout(timeoutTimer);
            }
        }
        for (const page of pages){
            let visibleHandle;
            try {
                visibleHandle = await page.waitForFunction(()=>'visible' === document.visibilityState, {
                    timeout: 3000
                });
            } catch (waitError) {
                this.logger.warn(`Visibility check timed out for page: ${page.url()}`);
                continue;
            }
            if (visibleHandle) {
                // The handle only signals success; release it so it does not keep
                // a remote object alive.
                try {
                    await visibleHandle.dispose();
                } catch (disposeError) {
                    this.logger.warn('Failed to dispose visibility handle:', disposeError);
                }
                this.logger.success('Active visible page retrieved successfully');
                return page;
            }
        }
        this.logger.warn('Active original page retrieved failed, fallback to active page');
        return this.browser.getActivePage();
    }
    /**
     * Lazily creates and caches the hotkey executor, configured with the OS and
     * browser names reported by `getEnvInfo`.
     * @returns {Promise<object>}
     */
    async getHotkeyExecutor() {
        if (this.hotkeyExecutor) return this.hotkeyExecutor;
        const activePage = await this.getActivePage();
        const pptrBrowser = activePage.browser();
        const { osName, browserName } = await (0, puppeteer_enhance_namespaceObject.getEnvInfo)(pptrBrowser);
        this.hotkeyExecutor = new puppeteer_enhance_namespaceObject.Hotkey({
            osName,
            browserName
        });
        return this.hotkeyExecutor;
    }
    /**
     * @returns {Promise<{url: string}>} URL of the active page, or `''` when it
     *   cannot be determined (the failure is logged, not thrown).
     */
    async getMeta() {
        try {
            const page = await this.getActivePage();
            return {
                url: page.url()
            };
        } catch (error) {
            this.logger.error('Failed to get page meta:', error);
        }
        return {
            url: ''
        };
    }
    /**
     * Converts a coordinate descriptor to page coordinates. `coords.normalized`
     * (multiplied by the screen width/height) takes precedence over `coords.raw`
     * (used as-is).
     *
     * @param {{normalized?: {x: number, y: number}, raw?: {x: number, y: number}}} coords
     * @returns {Promise<{realX: number, realY: number}>}
     * @throws {Error} When neither `normalized` nor `raw` is present.
     */
    async calculateRealCoords(coords) {
        if (coords.normalized) {
            const { screenWidth, screenHeight } = await this.getScreenContext();
            return {
                realX: coords.normalized.x * screenWidth,
                realY: coords.normalized.y * screenHeight
            };
        }
        if (!coords.raw) throw new Error('Invalide coordinates');
        return {
            realX: coords.raw.x,
            realY: coords.raw.y
        };
    }
}
// Publish the bundle's exports on the CommonJS `exports` object: the named
// export is copied explicitly, anything else found on `__webpack_exports__`
// is forwarded by the loop, and the module is flagged as an ES-module interop.
const __webpack_explicit_exports__ = [
    "BrowserOperator"
];
exports.BrowserOperator = __webpack_exports__.BrowserOperator;
for(const exportName in __webpack_exports__){
    const alreadyCopied = __webpack_explicit_exports__.includes(exportName);
    if (!alreadyCopied) exports[exportName] = __webpack_exports__[exportName];
}
Object.defineProperty(exports, '__esModule', {
    value: true
});
//# sourceMappingURL=browser-operator.js.map