@ui-tars/sdk
Version:
A powerful cross-platform (any device/platform) toolkit for building GUI automation agents for UI-TARS
387 lines (386 loc) • 19.6 kB
JavaScript
/**
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
import { ErrorStatusEnum, GUIAgentError, ShareVersion, StatusEnum } from "@ui-tars/shared/types";
import { IMAGE_PLACEHOLDER, MAX_LOOP_COUNT } from "@ui-tars/shared/constants";
import { sleep } from "@ui-tars/shared/utils";
import async_retry from "async-retry";
import { Jimp } from "jimp";
import { v4 } from "uuid";
import { setContext } from "./context/useContext.mjs";
import { UITarsModel } from "./Model.mjs";
import { BaseGUIAgent } from "./base/index.mjs";
import { getSummary, processVlmParams, replaceBase64Prefix, toVlmModelFormat } from "./utils.mjs";
import { INTERNAL_ACTION_SPACES_ENUM, MAX_SNAPSHOT_ERR_CNT, SYSTEM_PROMPT, SYSTEM_PROMPT_TEMPLATE } from "./constants.mjs";
import { InternalServerError } from "openai";
/**
 * Sets `key` on `obj` to `value` and returns `obj`.
 *
 * When `key` is already reachable on `obj` (own or inherited), the property is
 * (re)defined directly on `obj` as an enumerable, configurable, writable data
 * property; otherwise a plain assignment is used.
 *
 * @param {object} obj - Target object.
 * @param {PropertyKey} key - Property name.
 * @param {*} value - Value to store.
 * @returns {object} The same `obj` that was passed in.
 */
function _define_property(obj, key, value) {
  const alreadyPresent = key in obj;
  if (!alreadyPresent) {
    obj[key] = value;
    return obj;
  }
  // Defining (rather than assigning) bypasses any inherited setter for `key`.
  const descriptor = {
    value,
    enumerable: true,
    configurable: true,
    writable: true,
  };
  Object.defineProperty(obj, key, descriptor);
  return obj;
}
/**
 * GUI automation agent.
 *
 * `run()` repeatedly takes a screenshot through the operator, sends the
 * conversation to the model, and executes the parsed actions until a terminal
 * status is reached, the loop budget is exhausted, or the run is stopped.
 */
class GUIAgent extends BaseGUIAgent {
  /**
   * Runs the screenshot -> model -> execute loop for a single instruction.
   *
   * Reads `signal`, `onData`, `onError`, `retry`, `maxLoopCount` and
   * `loopIntervalInMs` from `this.config`. Progress is reported through
   * `onData`; a final ERROR status is additionally reported through `onError`.
   *
   * @param {string} instruction - The instruction recorded as the first human message.
   * @param {Array} [historyMessages] - Prior messages handed to `toVlmModelFormat`; defaults to `[]`.
   * @param {object} [remoteModelHdrs] - Extra headers merged into the headers sent with each model invocation.
   * @returns {Promise<void>}
   */
  async run(instruction, historyMessages, remoteModelHdrs) {
    const { operator, model, logger } = this;
    const { signal, onData, onError, retry = {}, maxLoopCount = MAX_LOOP_COUNT } = this.config;
    const currentTime = Date.now();
    const data = {
      version: ShareVersion.V1,
      systemPrompt: this.systemPrompt,
      instruction,
      modelName: this.model.modelName,
      status: StatusEnum.INIT,
      logTime: currentTime,
      conversations: [
        {
          from: 'human',
          value: instruction,
          timing: {
            start: currentTime,
            end: currentTime,
            cost: 0,
          },
        },
      ],
    };
    setContext(Object.assign(this.config, {
      logger: this.logger,
      systemPrompt: this.systemPrompt,
      factors: this.model.factors,
      model: this.model,
    }));
    // NOTE(review): this serialises the whole model object into the log; if the
    // model holds credentials they would be logged too — confirm against UITarsModel.
    logger.info(`[GUIAgent] run:\nsystem prompt: ${this.systemPrompt},\nmodel version: ${this.uiTarsVersion},\nmodel config: ${JSON.stringify(this.model)}`);

    // Status-only update: conversations are intentionally sent empty.
    const emitStatus = () => onData?.({
      data: {
        ...data,
        conversations: [],
      },
    });
    // Incremental update: only the most recently appended conversation is sent.
    const emitLatestConversation = () => onData?.({
      data: {
        ...data,
        conversations: data.conversations.slice(-1),
      },
    });

    let loopCnt = 0;
    let snapshotErrCnt = 0;
    let totalTokens = 0;
    let totalTime = 0;
    let previousResponseId;
    data.status = StatusEnum.RUNNING;
    await emitStatus();
    const sessionId = this.generateSessionId();
    try {
      while (true) {
        logger.info('[GUIAgent] loopCnt:', loopCnt);
        if (this.isPaused && this.resumePromise) {
          data.status = StatusEnum.PAUSE;
          await emitStatus();
          // Settled by resume() or stop().
          await this.resumePromise;
          data.status = StatusEnum.RUNNING;
          await emitStatus();
        }
        const isTerminalStatus = data.status !== StatusEnum.RUNNING && data.status !== StatusEnum.PAUSE;
        if (this.isStopped || isTerminalStatus || signal?.aborted) {
          if (signal?.aborted) {
            data.status = StatusEnum.USER_STOPPED;
          }
          break;
        }
        if (loopCnt >= maxLoopCount) {
          Object.assign(data, {
            status: StatusEnum.ERROR,
            error: this.guiAgentErrorParser(ErrorStatusEnum.REACH_MAXLOOP_ERROR),
          });
          break;
        }
        if (snapshotErrCnt >= MAX_SNAPSHOT_ERR_CNT) {
          Object.assign(data, {
            status: StatusEnum.ERROR,
            error: this.guiAgentErrorParser(ErrorStatusEnum.SCREENSHOT_RETRY_ERROR),
          });
          break;
        }
        loopCnt += 1;
        const start = Date.now();
        const snapshot = await async_retry(() => operator.screenshot(), {
          retries: retry?.screenshot?.maxRetries ?? 0,
          minTimeout: 5000,
          onRetry: retry?.screenshot?.onRetry,
        });
        // The decode runs inside an async function so that synchronous failures
        // (missing snapshot, missing/invalid base64) become rejections and are
        // handled like any other unreadable screenshot instead of aborting the run.
        const decodeSnapshot = async () => Jimp.fromBuffer(Buffer.from(replaceBase64Prefix(snapshot.base64), 'base64'));
        const { width, height, mime } = await decodeSnapshot().catch((e) => {
          logger.error('[GUIAgent] screenshot error', e);
          return {
            width: null,
            height: null,
            mime: '',
          };
        });
        const isValidImage = !!(snapshot?.base64 && width && height);
        if (!isValidImage) {
          // A bad screenshot does not consume loop budget; it is tracked separately.
          loopCnt -= 1;
          snapshotErrCnt += 1;
          await sleep(1000);
          continue;
        }
        let end = Date.now();
        data.conversations.push({
          from: 'human',
          value: IMAGE_PLACEHOLDER,
          screenshotBase64: snapshot.base64,
          screenshotContext: {
            size: {
              width,
              height,
            },
            mime,
            scaleFactor: snapshot.scaleFactor,
          },
          timing: {
            start,
            end,
            cost: end - start,
          },
        });
        await emitLatestConversation();
        const modelFormat = toVlmModelFormat({
          historyMessages: historyMessages || [],
          conversations: data.conversations,
          systemPrompt: data.systemPrompt,
        });
        const vlmParams = {
          ...processVlmParams(modelFormat.conversations, modelFormat.images),
          screenContext: {
            width,
            height,
          },
          scaleFactor: snapshot.scaleFactor,
          uiTarsVersion: this.uiTarsVersion,
          headers: {
            ...remoteModelHdrs,
            'X-Session-Id': sessionId,
          },
          previousResponseId,
        };
        // Remember the pre-invoke state so a retry that eventually succeeds does
        // not leave behind the ERROR stamped by an earlier failed attempt.
        const statusBeforeInvoke = data.status;
        const hadErrorBeforeInvoke = 'error' in data;
        const errorBeforeInvoke = data.error;
        let invokeFailedOnce = false;
        const { prediction, parsedPredictions, costTime, costTokens, responseId } = await async_retry(async (bail) => {
          try {
            const result = await model.invoke(vlmParams);
            if (invokeFailedOnce) {
              data.status = statusBeforeInvoke;
              if (hadErrorBeforeInvoke) {
                data.error = errorBeforeInvoke;
              } else {
                delete data.error;
              }
            }
            return result;
          } catch (error) {
            const isAbort = error instanceof Error
              && (error?.name === 'APIUserAbortError' || error?.message?.includes('aborted'));
            if (isAbort) {
              // Aborts are not retried: bail() rejects the retry immediately.
              bail(error);
              return {
                prediction: '',
                parsedPredictions: [],
              };
            }
            invokeFailedOnce = true;
            Object.assign(data, {
              status: StatusEnum.ERROR,
              error: this.guiAgentErrorParser(ErrorStatusEnum.INVOKE_RETRY_ERROR, error),
            });
            // Rethrow so async-retry schedules the next attempt (or gives up).
            throw error;
          }
        }, {
          retries: retry?.model?.maxRetries ?? 0,
          minTimeout: 30000,
          onRetry: retry?.model?.onRetry,
        });
        if (responseId) {
          previousResponseId = responseId;
        }
        totalTokens += costTokens || 0;
        totalTime += costTime || 0;
        logger.info(`[GUIAgent] consumes: >>> costTime: ${costTime}, costTokens: ${costTokens} <<<`);
        logger.info('[GUIAgent] Response:', prediction);
        logger.info('[GUIAgent] Parsed Predictions:', JSON.stringify(parsedPredictions));
        if (!prediction) {
          logger.error('[GUIAgent] Response Empty:', prediction);
          continue;
        }
        const predictionSummary = getSummary(prediction);
        end = Date.now();
        data.conversations.push({
          from: 'gpt',
          value: predictionSummary,
          timing: {
            start,
            end,
            cost: end - start,
          },
          screenshotContext: {
            size: {
              width,
              height,
            },
            scaleFactor: snapshot.scaleFactor,
          },
          predictionParsed: parsedPredictions,
        });
        await emitLatestConversation();
        for (const parsedPrediction of parsedPredictions) {
          const actionType = parsedPrediction.action_type;
          logger.info('[GUIAgent] Action:', actionType);
          if (actionType === INTERNAL_ACTION_SPACES_ENUM.ERROR_ENV) {
            Object.assign(data, {
              status: StatusEnum.ERROR,
              error: this.guiAgentErrorParser(ErrorStatusEnum.ENVIRONMENT_ERROR),
            });
            break;
          }
          if (actionType === INTERNAL_ACTION_SPACES_ENUM.MAX_LOOP) {
            Object.assign(data, {
              status: StatusEnum.ERROR,
              error: this.guiAgentErrorParser(ErrorStatusEnum.REACH_MAXLOOP_ERROR),
            });
            break;
          }
          if (!signal?.aborted && !this.isStopped) {
            logger.info('[GUIAgent] Action Inputs:', parsedPrediction.action_inputs, parsedPrediction.action_type);
            const executeOutput = await async_retry(() => operator.execute({
              prediction,
              parsedPrediction,
              screenWidth: width,
              screenHeight: height,
              scaleFactor: snapshot.scaleFactor,
              factors: this.model.factors,
            }), {
              retries: retry?.execute?.maxRetries ?? 0,
              minTimeout: 5000,
              onRetry: retry?.execute?.onRetry,
            }).catch((e) => {
              logger.error('[GUIAgent] execute error', e);
              Object.assign(data, {
                status: StatusEnum.ERROR,
                error: this.guiAgentErrorParser(ErrorStatusEnum.EXECUTE_RETRY_ERROR, e),
              });
            });
            // The operator may report a status of its own; it overrides the current one.
            if (executeOutput?.status) {
              data.status = executeOutput.status;
            }
          }
          if (actionType === INTERNAL_ACTION_SPACES_ENUM.CALL_USER) {
            data.status = StatusEnum.CALL_USER;
            break;
          }
          if (actionType === INTERNAL_ACTION_SPACES_ENUM.FINISHED) {
            data.status = StatusEnum.END;
            break;
          }
        }
        if (this.config.loopIntervalInMs && this.config.loopIntervalInMs > 0) {
          logger.info(`[GUIAgent] sleep for ${this.config.loopIntervalInMs}ms before next loop`);
          await sleep(this.config.loopIntervalInMs);
          logger.info(`[GUIAgent] sleep for ${this.config.loopIntervalInMs}ms before next loop done`);
        }
      }
    } catch (error) {
      logger.error('[GUIAgent] Catch error', error);
      const wasAborted = error instanceof Error
        && (error.name === 'AbortError' || error.message?.includes('aborted'));
      if (wasAborted) {
        logger.info('[GUIAgent] Catch: request was aborted');
        data.status = StatusEnum.USER_STOPPED;
        // The finally block below still runs after this return.
        return;
      }
      // NOTE(review): this replaces any more specific error already stored on
      // `data` (e.g. from exhausted model retries) — confirm that is intended.
      data.status = StatusEnum.ERROR;
      data.error = this.guiAgentErrorParser(ErrorStatusEnum.UNKNOWN_ERROR, error);
    } finally {
      logger.info('[GUIAgent] Finally: status', data.status);
      this.model?.reset();
      if (data.status === StatusEnum.USER_STOPPED) {
        // Let the operator know the user stopped the run.
        await operator.execute({
          prediction: '',
          parsedPrediction: {
            action_inputs: {},
            reflection: null,
            action_type: 'user_stop',
            thought: '',
          },
          screenWidth: 0,
          screenHeight: 0,
          scaleFactor: 1,
          factors: [
            0,
            0,
          ],
        });
      }
      await emitStatus();
      if (data.status === StatusEnum.ERROR) {
        onError?.({
          data,
          error: data.error || new GUIAgentError(ErrorStatusEnum.UNKNOWN_ERROR, 'Unknown error occurred'),
        });
      }
      logger.info(`[GUIAgent] >>> totalTokens: ${totalTokens}, totalTime: ${totalTime}, loopCnt: ${loopCnt} <<<`);
    }
  }

  /**
   * Requests that `run()` pause at the start of its next loop iteration.
   *
   * Calling it again while a pause is already pending keeps the existing
   * promise, so a loop that is already waiting on it can still be woken.
   */
  pause() {
    this.isPaused = true;
    if (!this.resumePromise) {
      this.resumePromise = new Promise((resolve) => {
        this.resolveResume = resolve;
      });
    }
  }

  /** Clears the paused state and wakes a `run()` that is waiting on the pause. */
  resume() {
    if (this.resolveResume) {
      this.resolveResume();
      this.resumePromise = null;
      this.resolveResume = null;
    }
    this.isPaused = false;
  }

  /**
   * Flags the agent as stopped; `run()` exits at its next stop check.
   *
   * Any pending pause is released as well, otherwise a paused `run()` would
   * wait forever and never observe the stop flag.
   */
  stop() {
    this.isStopped = true;
    if (this.resolveResume) {
      this.resolveResume();
      this.resumePromise = null;
      this.resolveResume = null;
    }
    this.isPaused = false;
  }

  /**
   * Builds the system prompt from the operator class's static
   * `MANUAL.ACTION_SPACES`, if present and non-empty.
   *
   * @returns {string} `SYSTEM_PROMPT` when there are no action spaces, otherwise
   *   `SYSTEM_PROMPT_TEMPLATE` with the action spaces joined by newlines.
   */
  buildSystemPrompt() {
    const actionSpaces = this.operator.constructor?.MANUAL?.ACTION_SPACES;
    if (actionSpaces == null || actionSpaces.length === 0) {
      return SYSTEM_PROMPT;
    }
    return SYSTEM_PROMPT_TEMPLATE.replace('{{action_spaces_holder}}', actionSpaces.join('\n'));
  }

  /**
   * Converts an error status (and optional underlying error) into a `GUIAgentError`.
   *
   * An `InternalServerError` always maps to `MODEL_SERVICE_ERROR`, regardless of
   * `type`. Otherwise known types get a descriptive message prefix, and anything
   * else becomes `UNKNOWN_ERROR`.
   *
   * @param {*} type - One of the `ErrorStatusEnum` values.
   * @param {*} [error] - The underlying error, if any.
   * @returns {GUIAgentError}
   */
  guiAgentErrorParser(type, error) {
    this.logger.error('[GUIAgent] guiAgentErrorParser:', error);
    let parseError = null;
    if (error instanceof InternalServerError) {
      this.logger.error('[GUIAgent] guiAgentErrorParser instanceof InternalServerError.');
      parseError = new GUIAgentError(ErrorStatusEnum.MODEL_SERVICE_ERROR, error.message, error.stack);
    }
    if (!parseError) {
      // Checked in order; the first matching status wins.
      const messagePrefixes = [
        [ErrorStatusEnum.REACH_MAXLOOP_ERROR, 'Has reached max loop count'],
        [ErrorStatusEnum.SCREENSHOT_RETRY_ERROR, 'Too many screenshot failures'],
        [ErrorStatusEnum.INVOKE_RETRY_ERROR, 'Too many model invoke failures'],
        [ErrorStatusEnum.EXECUTE_RETRY_ERROR, 'Too many action execute failures'],
        [ErrorStatusEnum.ENVIRONMENT_ERROR, 'The environment error occurred when parsing the action'],
      ];
      const matched = messagePrefixes.find(([status]) => status === type);
      if (matched) {
        const [status, prefix] = matched;
        parseError = new GUIAgentError(status, `${prefix}: ${error?.message || ''}`, error?.stack);
      }
    }
    if (!parseError) {
      const isError = error instanceof Error;
      parseError = new GUIAgentError(
        ErrorStatusEnum.UNKNOWN_ERROR,
        isError ? error.message : 'Unknown error occurred',
        isError ? error.stack || 'null' : 'null',
      );
    }
    if (!parseError.stack) {
      Error.captureStackTrace(parseError, this.guiAgentErrorParser);
    }
    return parseError;
  }

  /**
   * Creates the identifier sent as the `X-Session-Id` header for one `run()`.
   *
   * @returns {string} A UUID v4 string.
   */
  generateSessionId() {
    return v4();
  }

  /**
   * @param {object} config - Agent configuration. This constructor reads
   *   `operator`, `model` (a `UITarsModel` instance, or a value passed to
   *   `new UITarsModel(...)`), `logger` (defaults to `console`),
   *   `uiTarsVersion` and `systemPrompt` (defaults to `buildSystemPrompt()`).
   */
  constructor(config) {
    super(config);
    _define_property(this, "operator", void 0);
    _define_property(this, "model", void 0);
    _define_property(this, "logger", void 0);
    _define_property(this, "uiTarsVersion", void 0);
    _define_property(this, "systemPrompt", void 0);
    _define_property(this, "isPaused", false);
    _define_property(this, "resumePromise", null);
    _define_property(this, "resolveResume", null);
    _define_property(this, "isStopped", false);
    this.operator = config.operator;
    this.model = config.model instanceof UITarsModel ? config.model : new UITarsModel(config.model);
    this.logger = config.logger || console;
    this.uiTarsVersion = config.uiTarsVersion;
    // Must come after `operator` is set: buildSystemPrompt() reads it.
    this.systemPrompt = config.systemPrompt || this.buildSystemPrompt();
  }
}
export { GUIAgent };
//# sourceMappingURL=GUIAgent.mjs.map