@gui-agent/operator-aio
Version:
AIO (All-in-One) operator for GUI Agent
307 lines (306 loc) • 14.2 kB
JavaScript
/**
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
;
// Minimal bundler runtime: a bag of helper functions used to build the
// module's export object.
var __webpack_require__ = {};

// d(exports, definition): for every own key of `definition` that `exports`
// does not already own, install an enumerable getter-backed property.
// The getter is the function stored under that key, so the exported value is
// resolved lazily on first read rather than at registration time.
__webpack_require__.d = (exports1, definition)=>{
    for(const key in definition){
        const isOwnDefinition = __webpack_require__.o(definition, key);
        if (!isOwnDefinition || __webpack_require__.o(exports1, key)) continue;
        const descriptor = {
            enumerable: true,
            get: definition[key]
        };
        Object.defineProperty(exports1, key, descriptor);
    }
};

// o(obj, prop): own-property test that is safe even if `obj` shadows
// `hasOwnProperty`.
__webpack_require__.o = (obj, prop)=>Object.prototype.hasOwnProperty.call(obj, prop);

// r(exports): tag an export object as an ES module (`__esModule: true`) and,
// where Symbol.toStringTag is available, give it the 'Module' string tag.
__webpack_require__.r = (exports1)=>{
    const hasToStringTag = typeof Symbol !== 'undefined' && Symbol.toStringTag;
    if (hasToStringTag) {
        Object.defineProperty(exports1, Symbol.toStringTag, {
            value: 'Module'
        });
    }
    Object.defineProperty(exports1, '__esModule', {
        value: true
    });
};

// Export object for this module. The getter below only dereferences
// AIOHybridOperator when the property is read, so it may be registered before
// the class declaration has been evaluated.
var __webpack_exports__ = {};
__webpack_require__.r(__webpack_exports__);
__webpack_require__.d(__webpack_exports__, {
    AIOHybridOperator: ()=>AIOHybridOperator
});
const core_namespaceObject = require("@ui-tars/sdk/core");
const logger_namespaceObject = require("@agent-infra/logger");
const media_utils_namespaceObject = require("@agent-infra/media-utils");
const utils_namespaceObject = require("@ui-tars/shared/utils");
const external_utils_js_namespaceObject = require("./utils.js");
const external_AIOComputer_js_namespaceObject = require("./AIOComputer.js");
const external_AIOBrowser_js_namespaceObject = require("./AIOBrowser.js");
/**
 * Sets `obj[key]` to `value` and returns `obj`.
 *
 * When `key` is already visible on `obj` (own or inherited), the property is
 * (re)defined directly on `obj` as a plain enumerable, configurable, writable
 * data property, so an inherited accessor or read-only property is shadowed
 * rather than invoked. Otherwise a normal assignment is used.
 *
 * @param {object} obj - Target object.
 * @param {PropertyKey} key - Property name.
 * @param {*} value - Value to store.
 * @returns {object} The same `obj`.
 */
function _define_property(obj, key, value) {
    const alreadyVisible = key in obj;
    if (!alreadyVisible) {
        obj[key] = value;
        return obj;
    }
    const descriptor = {
        value,
        enumerable: true,
        configurable: true,
        writable: true
    };
    Object.defineProperty(obj, key, descriptor);
    return obj;
}
// Module-wide logger shared by every AIOHybridOperator method. Constructed
// with the string 'AioHybridOperator' (presumably the logger's prefix/label —
// confirm against @agent-infra/logger).
const logger = new logger_namespaceObject.ConsoleLogger('AioHybridOperator');
/**
 * Operator that combines an AIOComputer (screenshots, mouse, keyboard) with an
 * AIOBrowser (navigation, active URL) behind the core `Operator` interface.
 *
 * Instances should be obtained through the async `create()` factory, which
 * also performs the browser start-up that a constructor cannot await.
 */
class AIOHybridOperator extends core_namespaceObject.Operator {
    /**
     * Builds an operator, runs its async initialization, and records it as the
     * class-level `currentInstance`.
     *
     * @param {object} options - Forwarded to the AIOComputer constructor;
     *   `options.baseURL` is also passed on to AIOBrowser.
     * @returns {Promise<AIOHybridOperator>} The initialized operator.
     */
    static async create(options) {
        logger.info('[AioHybridOperator] construct:', options.baseURL);
        const instance = new AIOHybridOperator(options);
        await instance.initialize(options);
        this.currentInstance = instance;
        return instance;
    }

    /**
     * Creates and launches the AIOBrowser used for navigation actions.
     *
     * @param {object} options - Same options object given to `create()`.
     * @returns {Promise<void>}
     */
    async initialize(options) {
        // An initial screenshot request is issued without being awaited, so
        // start-up is not blocked on it (the meaning of the `0` argument is
        // defined by AIOComputer and not visible here). A rejection handler is
        // attached so that a failure of this best-effort call is logged
        // instead of surfacing as an unhandled promise rejection.
        Promise.resolve(this.aioComputer.screenshot(0)).catch((error)=>{
            logger.warn('[AioHybridOperator] Initial screenshot request failed:', error);
        });
        // NOTE(review): the option key is spelled `baseURl` (lower-case "l").
        // It is kept as-is because AIOBrowser.create's expected key is not
        // visible from this file — confirm before renaming.
        this.aioBrowser = await external_AIOBrowser_js_namespaceObject.AIOBrowser.create({
            baseURl: options.baseURL,
            logger: logger
        });
        if (this.aioBrowser != null) {
            await this.aioBrowser.launch({
                timeout: 1000,
                defaultViewport: {
                    width: 1280,
                    height: 1024
                }
            });
        }
        logger.info('[AioHybridOperator] AIOBrowser launched successfully');
        logger.info('[AioHybridOperator] AIOBrowser initialized successfully');
    }

    /**
     * Returns metadata about the current page.
     *
     * @returns {Promise<{url: string}>} The browser's active URL, or `''` when
     *   there is no browser, no URL, or the lookup fails (the failure is
     *   logged, not thrown).
     */
    async getMeta() {
        let url = '';
        try {
            const retUrl = this.aioBrowser != null ? await this.aioBrowser.getActiveUrl() : void 0;
            if (retUrl) url = retUrl;
        } catch (error) {
            logger.error('Failed to get page meta:', error);
        }
        return {
            url
        };
    }

    /**
     * Captures a screenshot through AIOComputer and remembers its pixel
     * dimensions; `execute()` uses those dimensions to convert box
     * coordinates into screen coordinates.
     *
     * @returns {Promise<{base64: string, scaleFactor: number}>} The image and
     *   its scale factor (defaults to 1 when the API reports none).
     * @throws {Error} When the API reports failure or returns no image data;
     *   the error is logged and rethrown.
     */
    async screenshot() {
        logger.info('[AioHybridOperator] Taking screenshot');
        try {
            const result = await this.aioComputer.screenshot();
            if (!result.success) throw new Error(result.message || 'Screenshot failed');
            const base64 = result.data == null ? void 0 : result.data.base64;
            if (!base64) throw new Error('No base64 image data received from screenshot API');
            const dimensions = new media_utils_namespaceObject.Base64ImageParser(base64).getDimensions();
            if (dimensions) {
                this.screenshotWidth = dimensions.width;
                this.screenshotHeight = dimensions.height;
            }
            logger.info('[AioHybridOperator] screenshot dimensions:', JSON.stringify(dimensions));
            return {
                base64,
                scaleFactor: result.data.scaleFactor || 1
            };
        } catch (error) {
            logger.error('[AioHybridOperator] Screenshot failed:', error);
            throw error;
        }
    }

    /**
     * Executes one parsed model action.
     *
     * Box coordinates (`start_box`, `end_box`) are converted with the
     * dimensions of the most recent screenshot and factors `[1000, 1000]`.
     * Actions whose required inputs are missing are silently skipped.
     *
     * @param {object} params
     * @param {{action_type: string, action_inputs?: object}} params.parsedPrediction
     *   The action to perform and its inputs.
     * @returns {Promise<object>} On success
     *   `{ startX, startY, startXPercent, startYPercent, action_inputs }`;
     *   the percent fields are only populated for pointer move/click actions
     *   that actually ran. If the action throws, the error is logged and
     *   `{ status: StatusEnum.ERROR }` is returned instead.
     */
    async execute(params) {
        const { parsedPrediction } = params;
        const { action_type, action_inputs } = parsedPrediction;
        // Guarded view of the inputs so every branch tolerates a prediction
        // that carries no `action_inputs` at all.
        const inputs = action_inputs || {};
        const startBoxStr = inputs.start_box || '';
        logger.info('[AioHybridOperator] Executing action', action_type, action_inputs, ', screen context', this.screenshotWidth, this.screenshotHeight);
        // Single conversion used for BOTH ends of a drag, so start and end
        // points are always expressed in the same coordinate space.
        const toScreenCoords = (boxStr)=>(0, external_utils_js_namespaceObject.parseBoxToScreenCoords)({
                boxStr,
                screenWidth: this.screenshotWidth,
                screenHeight: this.screenshotHeight,
                factors: [
                    1000,
                    1000
                ]
            });
        const { x: rawX, y: rawY, percentX: rawPercentX, percentY: rawPercentY } = toScreenCoords(startBoxStr);
        const startX = null !== rawX ? Math.round(rawX) : null;
        const startY = null !== rawY ? Math.round(rawY) : null;
        // Explicit null test: 0 is a valid coordinate.
        const hasStartPoint = null !== startX && null !== startY;
        logger.info(`[AioHybridOperator] Action position: (${startX}, ${startY})`);
        logger.info(`[AioHybridOperator] Action position percent raw: (${rawPercentX}, ${rawPercentY})`);
        let startXPercent = null;
        let startYPercent = null;
        const reportStartPercent = ()=>{
            startXPercent = rawPercentX;
            startYPercent = rawPercentY;
        };
        try {
            switch(action_type){
                case 'navigate':
                    logger.info('[AioHybridOperator] Navigating to', inputs.content);
                    if (this.aioBrowser != null) {
                        await this.aioBrowser.handleNavigate({
                            url: inputs.content || ''
                        });
                    }
                    break;
                case 'navigate_back':
                    logger.info('[AioHybridOperator] Navigating back');
                    if (this.aioBrowser != null) await this.aioBrowser.handleNavigateBack();
                    break;
                case 'wait':
                    logger.info('[AioHybridOperator] Waiting for 5 seconds');
                    await (0, utils_namespaceObject.sleep)(5000);
                    break;
                case 'mouse_move':
                case 'hover':
                    if (hasStartPoint) {
                        await this.aioComputer.moveTo(startX, startY);
                        reportStartPercent();
                    }
                    break;
                case 'click':
                case 'left_click':
                case 'left_single':
                    if (hasStartPoint) {
                        await this.aioComputer.click(startX, startY);
                        reportStartPercent();
                    }
                    break;
                case 'left_double':
                case 'double_click':
                    if (hasStartPoint) {
                        await this.aioComputer.doubleClick(startX, startY);
                        reportStartPercent();
                    }
                    break;
                case 'right_click':
                case 'right_single':
                    if (hasStartPoint) {
                        await this.aioComputer.rightClick(startX, startY);
                        reportStartPercent();
                    }
                    break;
                case 'middle_click':
                    if (hasStartPoint) {
                        await this.aioComputer.click(startX, startY, 'middle');
                        reportStartPercent();
                    }
                    break;
                case 'left_click_drag':
                case 'drag':
                case 'select':
                    if (inputs.end_box) {
                        const { x: rawEndX, y: rawEndY } = toScreenCoords(inputs.end_box);
                        const endX = null !== rawEndX ? Math.round(rawEndX) : null;
                        const endY = null !== rawEndY ? Math.round(rawEndY) : null;
                        if (hasStartPoint && null !== endX && null !== endY) {
                            await this.aioComputer.moveTo(startX, startY);
                            await this.aioComputer.mouseDown();
                            try {
                                await this.aioComputer.dragTo(endX, endY);
                            } finally{
                                // Always release the button, even if the drag
                                // itself failed, so it is never left held.
                                await this.aioComputer.mouseUp();
                            }
                        }
                    }
                    break;
                case 'type':
                    {
                        const content = inputs.content == null ? void 0 : inputs.content.trim();
                        if (content) {
                            // Drop one trailing newline marker, either the
                            // two-character literal `\n` or a real newline.
                            // NOTE(review): the newline is stripped but no
                            // Enter key is sent afterwards, although the
                            // ACTION_SPACES text tells the model to append
                            // "\n" to submit — confirm whether that is intended.
                            const stripContent = content.replace(/\\n$/, '').replace(/\n$/, '');
                            await this.aioComputer.type(stripContent);
                        }
                        break;
                    }
                case 'hotkey':
                case 'press':
                    {
                        const keyStr = inputs.key || inputs.hotkey;
                        if (keyStr) {
                            // Keys may be separated by whitespace or '+'.
                            const keys = keyStr.split(/[\s+]/).filter((k)=>k.length > 0);
                            if (keys.length > 1) await this.aioComputer.hotkey(keys);
                            else {
                                // Send the parsed key so stray separators
                                // (" enter", "enter+") are not passed through.
                                // If nothing parsed (e.g. the key is "+"
                                // itself), fall back to the raw string.
                                await this.aioComputer.press(1 === keys.length ? keys[0] : keyStr);
                            }
                        }
                        break;
                    }
                case 'scroll':
                    {
                        const { direction } = inputs;
                        if (hasStartPoint && direction) {
                            // NOTE(review): a start point is required but is
                            // not passed to scroll(); the sign convention of
                            // (dx, dy) is defined by AIOComputer.scroll —
                            // confirm both against that implementation.
                            const normalizedDirection = direction.toLowerCase();
                            let dx = 0;
                            let dy = 0;
                            switch(normalizedDirection){
                                case 'up':
                                    dy = 10;
                                    break;
                                case 'down':
                                    dy = -10;
                                    break;
                                case 'left':
                                    dx = 10;
                                    break;
                                case 'right':
                                    dx = -10;
                                    break;
                            }
                            if (0 !== dx || 0 !== dy) await this.aioComputer.scroll(dx, dy);
                        }
                        break;
                    }
                case 'error_env':
                case 'call_user':
                case 'finished':
                case 'user_stop':
                    // Recognized action types that require no operation here.
                    break;
                default:
                    logger.warn(`Unsupported action type: ${action_type}`);
            }
            logger.info(`[AioHybridOperator] position percent return: (${startXPercent}, ${startYPercent})`);
            return {
                startX,
                startY,
                startXPercent,
                startYPercent,
                action_inputs
            };
        } catch (error) {
            // The log text below is the runtime message meaning "execution failed".
            logger.error("[AioHybridOperator] \u6267\u884C\u5931\u8D25:", error);
            return {
                status: core_namespaceObject.StatusEnum.ERROR
            };
        }
    }

    /**
     * Sets up instance fields and the AIOComputer client. The browser is
     * created later by `initialize()`, so `aioBrowser` starts as null.
     *
     * @param {object} options - Passed to the AIOComputer constructor.
     */
    constructor(options){
        super();
        _define_property(this, "aioBrowser", null);
        _define_property(this, "aioComputer", void 0);
        // Initial screenshot size, replaced by the real dimensions once
        // `screenshot()` has decoded an image.
        _define_property(this, "screenshotWidth", 1280);
        _define_property(this, "screenshotHeight", 1024);
        this.aioComputer = new external_AIOComputer_js_namespaceObject.AIOComputer(options);
    }
}
// Static members are attached after the class body via _define_property.
// The bare block keeps the helper constant out of module scope.
{
    // Action signatures exposed as MANUAL.ACTION_SPACES, written as
    // prompt-style strings including their inline `#` usage hints.
    const actionSpaces = [
        "click(start_box='[x1, y1, x2, y2]')",
        "left_double(start_box='[x1, y1, x2, y2]')",
        "right_single(start_box='[x1, y1, x2, y2]')",
        "drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')",
        "hotkey(key='')",
        "type(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.",
        "scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')",
        "wait() #Sleep for 5s and take a screenshot to check for any changes.",
        "finished()",
        "call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help."
    ];
    _define_property(AIOHybridOperator, "MANUAL", {
        ACTION_SPACES: actionSpaces
    });
    // Starts as null; the static create() factory assigns the instance it builds.
    _define_property(AIOHybridOperator, "currentInstance", null);
}
// CommonJS interop: publish the named export explicitly, then copy across any
// other enumerable keys found on the bundler's export object, and finally mark
// `exports` as an ES-module namespace for interop-aware importers.
exports.AIOHybridOperator = __webpack_exports__.AIOHybridOperator;
for(var __webpack_i__ in __webpack_exports__){
    // Already assigned above.
    if ("AIOHybridOperator" === __webpack_i__) continue;
    exports[__webpack_i__] = __webpack_exports__[__webpack_i__];
}
Object.defineProperty(exports, '__esModule', {
    value: true
});
//# sourceMappingURL=AIOHybridOperator.js.map