/*
 * @ui-tars/operator-nut-js
 * Version:
 * Operator Nut JS SDK for UI-TARS
 * 240 lines (239 loc) • 11 kB
 * JavaScript
 */
/**
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
import { Operator, StatusEnum, parseBoxToScreenCoords, useContext } from "@ui-tars/sdk/core";
import { Jimp } from "jimp";
import { Button, Key, Point, Region, centerOf, clipboard, keyboard, mouse, screen, sleep, straightTo } from "@computer-use/nut-js";
import big from "big.js";
/**
 * Sets `obj[key]` to `value` and hands the same object back.
 *
 * When `key` is already reachable on `obj` (own or inherited), the property is
 * (re)defined directly on `obj` as an enumerable, configurable, writable data
 * property; otherwise a plain assignment is used.
 *
 * @param {object} obj - Target object; mutated in place.
 * @param {PropertyKey} key - Property name to set.
 * @param {*} value - Value to store.
 * @returns {object} The same `obj` reference.
 */
function _define_property(obj, key, value) {
  if (!(key in obj)) {
    // Unknown key: a simple assignment creates the property.
    obj[key] = value;
    return obj;
  }
  // Known key: define it explicitly on obj itself with fully open attributes.
  const descriptor = {
    configurable: true,
    enumerable: true,
    writable: true,
    value,
  };
  Object.defineProperty(obj, key, descriptor);
  return obj;
}
/**
 * Moves the mouse pointer along a straight path to the given coordinates.
 *
 * Does nothing when either coordinate is missing. Both `null` and `undefined`
 * count as missing, so a partially populated coordinate pair never reaches
 * `new Point(...)`.
 *
 * @param {number|null|undefined} startX - Target x coordinate.
 * @param {number|null|undefined} startY - Target y coordinate.
 * @returns {Promise<void>} Resolves once the move has completed (or was skipped).
 */
const moveStraightTo = async (startX, startY) => {
  // `== null` matches both null and undefined; 0 is a valid coordinate and passes.
  if (startX == null || startY == null) return;
  await mouse.move(straightTo(new Point(startX, startY)));
};
/**
 * Operator that captures the screen and performs mouse/keyboard actions on the
 * local desktop through nut-js.
 */
class NutJSOperator extends Operator {
  /**
   * Grabs the current screen, scales it down by the reported pixel density and
   * encodes the result as a base64 PNG.
   *
   * @returns {Promise<{base64: string, scaleFactor: number}>} The encoded
   *   image and the horizontal pixel-density scale of the grabbed screen.
   */
  async screenshot() {
    const { logger } = useContext();
    const grabImage = await screen.grab();
    const screenWithScale = await grabImage.toRGB();
    const { scaleX, scaleY } = screenWithScale.pixelDensity;
    const scaleFactor = scaleX;
    logger.info('[NutjsOperator]', 'scaleX', scaleX, 'scaleY', scaleY);
    const screenWithScaleImage = await Jimp.fromBitmap({
      width: screenWithScale.width,
      height: screenWithScale.height,
      data: Buffer.from(screenWithScale.data),
    });
    // Divide the grabbed size by the pixel density to get the target size.
    const width = screenWithScale.width / scaleX;
    const height = screenWithScale.height / scaleY;
    const physicalScreenImage = await screenWithScaleImage
      .resize({ w: width, h: height })
      .getBuffer('image/png');
    const output = {
      base64: physicalScreenImage.toString('base64'),
      scaleFactor,
    };
    // logger is already used unguarded above, so no null guard is needed here.
    logger.info(`[NutjsOperator] screenshot: ${width}x${height}, scaleFactor: ${scaleFactor}`);
    return output;
  }

  /**
   * Executes one parsed prediction (click, drag, type, hotkey, scroll, ...).
   *
   * @param {object} params
   * @param {object} params.parsedPrediction - Carries `action_type` and
   *   `action_inputs` (`start_box`, `end_box`, `content`, `key`/`hotkey`,
   *   `direction`, depending on the action).
   * @param {number} params.screenWidth - Passed to `parseBoxToScreenCoords`.
   * @param {number} params.screenHeight - Passed to `parseBoxToScreenCoords`.
   * @param {number} params.scaleFactor - Only logged here.
   * @returns {Promise<{status: *}|undefined>} `{ status: StatusEnum.END }` for
   *   the terminal actions (`error_env`, `call_user`, `finished`,
   *   `user_stop`); `undefined` otherwise.
   */
  async execute(params) {
    const { logger } = useContext();
    const { parsedPrediction, screenWidth, screenHeight, scaleFactor } = params;
    const { action_type, action_inputs } = parsedPrediction;
    const startBoxStr = action_inputs?.start_box || '';
    logger.info('[NutjsOperator] execute', scaleFactor);
    const { x: startX, y: startY } = parseBoxToScreenCoords({
      boxStr: startBoxStr,
      screenWidth,
      screenHeight,
    });
    logger.info(`[NutjsOperator Position]: (${startX}, ${startY})`);
    mouse.config.mouseSpeed = 3600;

    // A coordinate is usable when it is neither nullish nor NaN; 0 is valid.
    const isUsableCoord = (value) => value != null && !Number.isNaN(value);

    /**
     * Translates a hotkey string such as "ctrl+shift c" into nut-js key codes.
     * Unknown tokens are dropped; an empty/missing string logs an error.
     */
    const getHotkeys = (keyStr) => {
      if (!keyStr) {
        logger.error('[NutjsOperator] hotkey error: ', `${keyStr} is not a valid key`);
        return [];
      }
      const isMac = 'darwin' === process.platform;
      const platformCommandKey = isMac ? Key.LeftCmd : Key.LeftWin;
      const platformCtrlKey = isMac ? Key.LeftCmd : Key.LeftControl;

      // Lower-cased member name -> key code. A Map avoids accidental hits on
      // Object.prototype members (e.g. a token "constructor"). Purely numeric
      // entry names are skipped: if Key is a numeric enum compiled from
      // TypeScript, those are reverse mappings (code -> name), not keys.
      const keyLookup = new Map(
        Object.entries(Key)
          .filter(([name]) => !/^\d+$/.test(name))
          .map(([name, code]) => [name.toLowerCase(), code]),
      );
      // Aliases take precedence over the enum member names.
      const aliases = {
        return: Key.Enter,
        ctrl: platformCtrlKey,
        shift: Key.LeftShift,
        alt: Key.LeftAlt,
        pagedown: Key.PageDown,
        pageup: Key.PageUp,
        meta: platformCommandKey,
        win: platformCommandKey,
        command: platformCommandKey,
        cmd: platformCommandKey,
        ',': Key.Comma,
        arrowup: Key.Up,
        arrowdown: Key.Down,
        arrowleft: Key.Left,
        arrowright: Key.Right,
      };
      for (const [alias, code] of Object.entries(aliases)) {
        keyLookup.set(alias, code);
      }

      const resolveToken = (token) => {
        const direct = keyLookup.get(token);
        if (direct != null) return direct;
        // Single digits fall back to a `Num<digit>` member when Key has one.
        return /^\d$/.test(token) ? keyLookup.get(`num${token}`) : undefined;
      };

      const keys = keyStr
        // Whitespace is a separator, so fuse "page down"/"page up" first;
        // otherwise "page down" would resolve to the Down arrow.
        .replace(/page\s+(up|down)/gi, 'page$1')
        .split(/[\s+]+/)
        .map((token) => resolveToken(token.toLowerCase()))
        // Nullish check, not truthiness: a key code of 0 is a valid key.
        .filter((code) => code != null);
      logger.info('[NutjsOperator] hotkey: ', keys);
      return keys;
    };

    // Shared by hotkey/press/release: the key string may arrive under either name.
    const resolveActionKeys = () => getHotkeys(action_inputs?.key || action_inputs?.hotkey);

    switch (action_type) {
      case 'wait':
        logger.info('[NutjsOperator] wait', action_inputs);
        await sleep(5000);
        break;
      case 'mouse_move':
      case 'hover':
        logger.info('[NutjsOperator] mouse_move');
        await moveStraightTo(startX, startY);
        break;
      case 'click':
      case 'left_click':
      case 'left_single':
        logger.info('[NutjsOperator] left_click');
        await moveStraightTo(startX, startY);
        await sleep(100);
        await mouse.click(Button.LEFT);
        break;
      case 'left_double':
      case 'double_click':
        logger.info(`[NutjsOperator] ${action_type}(${startX}, ${startY})`);
        await moveStraightTo(startX, startY);
        await sleep(100);
        await mouse.doubleClick(Button.LEFT);
        break;
      case 'right_click':
      case 'right_single':
        logger.info('[NutjsOperator] right_click');
        await moveStraightTo(startX, startY);
        await sleep(100);
        await mouse.click(Button.RIGHT);
        break;
      case 'middle_click':
        logger.info('[NutjsOperator] middle_click');
        await moveStraightTo(startX, startY);
        await mouse.click(Button.MIDDLE);
        break;
      case 'left_click_drag':
      case 'drag':
      case 'select':
        logger.info('[NutjsOperator] drag', action_inputs);
        if (action_inputs?.end_box) {
          const { x: endX, y: endY } = parseBoxToScreenCoords({
            boxStr: action_inputs.end_box,
            screenWidth,
            screenHeight,
          });
          // Explicit check instead of truthiness so a drag that touches
          // coordinate 0 (screen edge) is not silently skipped.
          if ([startX, startY, endX, endY].every(isUsableCoord)) {
            const diffX = big(endX).minus(startX).toNumber();
            const diffY = big(endY).minus(startY).toNumber();
            // NOTE(review): the target is the center of the start->end region
            // and no explicit move to the start point happens first — confirm
            // this is the intended drag path.
            await mouse.drag(straightTo(centerOf(new Region(startX, startY, diffX, diffY))));
          }
        }
        break;
      case 'type': {
        const rawContent = action_inputs?.content ?? '';
        const content = rawContent.trim();
        logger.info('[NutjsOperator] type', content);
        // Decide on "submit" from the untrimmed text: trim() removes a real
        // trailing newline, so checking `content` alone only ever sees the
        // literal backslash-n form.
        const shouldSubmit = /\n\s*$/.test(rawContent) || content.endsWith('\\n');
        const stripContent = content.replace(/\\n$/, '').replace(/\n$/, '');
        if (!stripContent && !shouldSubmit) break;
        keyboard.config.autoDelayMs = 0;
        try {
          if (stripContent) {
            if ('win32' === process.platform) {
              // On Windows the text is pasted through the clipboard; the
              // previous clipboard content is restored even if pasting fails.
              const originalClipboard = await clipboard.getContent();
              try {
                await clipboard.setContent(stripContent);
                await keyboard.pressKey(Key.LeftControl, Key.V);
                await sleep(50);
                await keyboard.releaseKey(Key.LeftControl, Key.V);
                await sleep(50);
              } finally {
                await clipboard.setContent(originalClipboard);
              }
            } else {
              await keyboard.type(stripContent);
            }
          }
          if (shouldSubmit) {
            await keyboard.pressKey(Key.Enter);
            await keyboard.releaseKey(Key.Enter);
          }
        } finally {
          // Always put the key delay back, even when typing threw.
          keyboard.config.autoDelayMs = 500;
        }
        break;
      }
      case 'hotkey': {
        const keys = resolveActionKeys();
        if (keys.length > 0) {
          await keyboard.pressKey(...keys);
          await keyboard.releaseKey(...keys);
        }
        break;
      }
      case 'press': {
        const keys = resolveActionKeys();
        if (keys.length > 0) await keyboard.pressKey(...keys);
        break;
      }
      case 'release': {
        const keys = resolveActionKeys();
        if (keys.length > 0) await keyboard.releaseKey(...keys);
        break;
      }
      case 'scroll': {
        const { direction } = action_inputs ?? {};
        if (startX != null && startY != null) await moveStraightTo(startX, startY);
        switch (direction?.toLowerCase()) {
          case 'up':
            await mouse.scrollUp(500);
            break;
          case 'down':
            await mouse.scrollDown(500);
            break;
          // left/right are advertised in MANUAL.ACTION_SPACES, so handle them.
          case 'left':
            await mouse.scrollLeft(500);
            break;
          case 'right':
            await mouse.scrollRight(500);
            break;
          default:
            logger.warn(`[NutjsOperator] Unsupported scroll direction: ${direction}`);
        }
        break;
      }
      case 'error_env':
      case 'call_user':
      case 'finished':
      case 'user_stop':
        return {
          status: StatusEnum.END,
        };
      default:
        logger.warn(`Unsupported action: ${action_type}`);
    }
  }
}
// Attach the static MANUAL descriptor to NutJSOperator. ACTION_SPACES lists the
// action signatures (with inline usage hints) that this operator advertises.
// NOTE(review): presumably consumed by the SDK when building the model prompt —
// confirm against the caller.
{
  const actionSpaces = [
    "click(start_box='[x1, y1, x2, y2]')",
    "left_double(start_box='[x1, y1, x2, y2]')",
    "right_single(start_box='[x1, y1, x2, y2]')",
    "drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')",
    "hotkey(key='')",
    "type(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.",
    "scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')",
    "wait() #Sleep for 5s and take a screenshot to check for any changes.",
    "finished()",
    "call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help."
  ];
  const manual = { ACTION_SPACES: actionSpaces };
  _define_property(NutJSOperator, "MANUAL", manual);
}
export { NutJSOperator };
//# sourceMappingURL=index.mjs.map