UNPKG

@ui-tars/sdk

Version:

A powerful cross-platform (any device/platform) toolkit for building GUI automation agents for UI-TARS

387 lines (386 loc) 19.6 kB
/**
 * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
 * SPDX-License-Identifier: Apache-2.0
 */
import { ErrorStatusEnum, GUIAgentError, ShareVersion, StatusEnum } from "@ui-tars/shared/types";
import { IMAGE_PLACEHOLDER, MAX_LOOP_COUNT } from "@ui-tars/shared/constants";
import { sleep } from "@ui-tars/shared/utils";
import async_retry from "async-retry";
import { Jimp } from "jimp";
import { v4 } from "uuid";
import { setContext } from "./context/useContext.mjs";
import { UITarsModel } from "./Model.mjs";
import { BaseGUIAgent } from "./base/index.mjs";
import { getSummary, processVlmParams, replaceBase64Prefix, toVlmModelFormat } from "./utils.mjs";
import { INTERNAL_ACTION_SPACES_ENUM, MAX_SNAPSHOT_ERR_CNT, SYSTEM_PROMPT, SYSTEM_PROMPT_TEMPLATE } from "./constants.mjs";
import { InternalServerError } from "openai";

/**
 * Transpiler helper that initialises a class field on `obj`.
 * Redefines the property (enumerable, configurable, writable) when the key
 * already exists on the object or its prototype chain, otherwise assigns it.
 *
 * @param {object} obj - Target object.
 * @param {string} key - Property name.
 * @param {*} value - Initial value.
 * @returns {object} The same `obj`.
 */
function _define_property(obj, key, value) {
    if (key in obj) {
        Object.defineProperty(obj, key, {
            value: value,
            enumerable: true,
            configurable: true,
            writable: true
        });
    } else {
        obj[key] = value;
    }
    return obj;
}

/**
 * Tells whether `error` looks like a cancelled request: an `Error` whose name
 * equals `abortName` or whose message contains the word "aborted".
 *
 * @param {*} error - The caught value.
 * @param {string} abortName - Error name that identifies an abort.
 * @returns {boolean}
 */
function isAbortError(error, abortName) {
    return error instanceof Error && Boolean(error.name === abortName || error.message?.includes('aborted'));
}

/**
 * Decodes the screenshot far enough to read its width, height and mime type.
 * Every failure, including a missing `snapshot` or a synchronous throw while
 * building the buffer, is logged and reported as an empty result so the
 * caller can count it as a screenshot error instead of aborting the run.
 *
 * @param {{ base64: string }} snapshot - Result of `operator.screenshot()`.
 * @param {{ error: Function }} logger - Logger used to report decode failures.
 * @returns {Promise<{ width: ?number, height: ?number, mime: string }>}
 */
async function readSnapshotMeta(snapshot, logger) {
    try {
        return await Jimp.fromBuffer(Buffer.from(replaceBase64Prefix(snapshot.base64), 'base64'));
    } catch (e) {
        logger.error('[GUIAgent] screenshot error', e);
        return {
            width: null,
            height: null,
            mime: ''
        };
    }
}

/**
 * Agent that repeatedly takes a screenshot through the operator, sends it to
 * the model, and executes the actions parsed from the model's prediction until
 * a terminal status is reached.
 */
class GUIAgent extends BaseGUIAgent {
    /**
     * Runs the screenshot -> model -> execute loop for one instruction.
     * Progress is reported through `config.onData`; a final ERROR status is
     * additionally reported through `config.onError`.
     *
     * @param {string} instruction - The task given by the user.
     * @param {Array} [historyMessages] - Earlier messages passed to `toVlmModelFormat`.
     * @param {object} [remoteModelHdrs] - Extra headers merged into each model request.
     * @returns {Promise<void>}
     */
    async run(instruction, historyMessages, remoteModelHdrs) {
        const { operator, model, logger } = this;
        const { signal, onData, onError, retry = {}, maxLoopCount = MAX_LOOP_COUNT } = this.config;
        const currentTime = Date.now();
        const data = {
            version: ShareVersion.V1,
            systemPrompt: this.systemPrompt,
            instruction,
            modelName: this.model.modelName,
            status: StatusEnum.INIT,
            logTime: currentTime,
            conversations: [
                {
                    from: 'human',
                    value: instruction,
                    timing: {
                        start: currentTime,
                        end: currentTime,
                        cost: 0
                    }
                }
            ]
        };
        // Status-only notification: the current state without any conversation payload.
        const emitStatus = () => onData?.({
            data: {
                ...data,
                conversations: []
            }
        });
        // Notification carrying only the most recently appended conversation.
        const emitLatestConversation = () => onData?.({
            data: {
                ...data,
                conversations: data.conversations.slice(-1)
            }
        });

        setContext(Object.assign(this.config, {
            logger: this.logger,
            systemPrompt: this.systemPrompt,
            factors: this.model.factors,
            model: this.model
        }));
        // NOTE(review): this serialises the whole model object into the log;
        // confirm it cannot contain credentials.
        logger.info(`[GUIAgent] run:\nsystem prompt: ${this.systemPrompt},\nmodel version: ${this.uiTarsVersion},\nmodel config: ${JSON.stringify(this.model)}`);

        let loopCnt = 0;
        let snapshotErrCnt = 0;
        let totalTokens = 0;
        let totalTime = 0;
        let previousResponseId;

        data.status = StatusEnum.RUNNING;
        await emitStatus();
        const sessionId = this.generateSessionId();

        try {
            while (true) {
                logger.info('[GUIAgent] loopCnt:', loopCnt);

                // Block here while paused; resume() or stop() releases the gate.
                if (this.isPaused && this.resumePromise) {
                    data.status = StatusEnum.PAUSE;
                    await emitStatus();
                    await this.resumePromise;
                    data.status = StatusEnum.RUNNING;
                    await emitStatus();
                }

                if (this.isStopped || (data.status !== StatusEnum.RUNNING && data.status !== StatusEnum.PAUSE) || signal?.aborted) {
                    if (signal?.aborted) {
                        data.status = StatusEnum.USER_STOPPED;
                    }
                    break;
                }
                if (loopCnt >= maxLoopCount) {
                    Object.assign(data, {
                        status: StatusEnum.ERROR,
                        error: this.guiAgentErrorParser(ErrorStatusEnum.REACH_MAXLOOP_ERROR)
                    });
                    break;
                }
                if (snapshotErrCnt >= MAX_SNAPSHOT_ERR_CNT) {
                    Object.assign(data, {
                        status: StatusEnum.ERROR,
                        error: this.guiAgentErrorParser(ErrorStatusEnum.SCREENSHOT_RETRY_ERROR)
                    });
                    break;
                }

                loopCnt += 1;
                const start = Date.now();
                const snapshot = await async_retry(() => operator.screenshot(), {
                    retries: retry?.screenshot?.maxRetries ?? 0,
                    minTimeout: 5000,
                    onRetry: retry?.screenshot?.onRetry
                });
                const { width, height, mime } = await readSnapshotMeta(snapshot, logger);
                const isValidImage = !!(snapshot?.base64 && width && height);
                if (!isValidImage) {
                    // A bad screenshot does not consume a loop iteration; it is
                    // counted separately against MAX_SNAPSHOT_ERR_CNT.
                    loopCnt -= 1;
                    snapshotErrCnt += 1;
                    await sleep(1000);
                    continue;
                }

                let end = Date.now();
                data.conversations.push({
                    from: 'human',
                    value: IMAGE_PLACEHOLDER,
                    screenshotBase64: snapshot.base64,
                    screenshotContext: {
                        size: {
                            width,
                            height
                        },
                        mime,
                        scaleFactor: snapshot.scaleFactor
                    },
                    timing: {
                        start,
                        end,
                        cost: end - start
                    }
                });
                await emitLatestConversation();

                const modelFormat = toVlmModelFormat({
                    historyMessages: historyMessages || [],
                    conversations: data.conversations,
                    systemPrompt: data.systemPrompt
                });
                const vlmParams = {
                    ...processVlmParams(modelFormat.conversations, modelFormat.images),
                    screenContext: {
                        width,
                        height
                    },
                    scaleFactor: snapshot.scaleFactor,
                    uiTarsVersion: this.uiTarsVersion,
                    headers: {
                        ...remoteModelHdrs,
                        'X-Session-Id': sessionId
                    },
                    previousResponseId
                };

                const { prediction, parsedPredictions, costTime, costTokens, responseId } = await async_retry(async (bail) => {
                    try {
                        return await model.invoke(vlmParams);
                    } catch (error) {
                        if (isAbortError(error, 'APIUserAbortError')) {
                            // Cancelled requests must not be retried.
                            bail(error);
                            return {
                                prediction: '',
                                parsedPredictions: []
                            };
                        }
                        // Do not touch `data` here: this attempt may still be
                        // retried successfully, and a premature ERROR status
                        // would end the run on the next loop check. Once the
                        // retries are exhausted the rejection reaches the outer
                        // catch, which records the error.
                        throw error;
                    }
                }, {
                    retries: retry?.model?.maxRetries ?? 0,
                    minTimeout: 30000,
                    onRetry: retry?.model?.onRetry
                });

                if (responseId) {
                    previousResponseId = responseId;
                }
                totalTokens += costTokens || 0;
                totalTime += costTime || 0;
                logger.info(`[GUIAgent] consumes: >>> costTime: ${costTime}, costTokens: ${costTokens} <<<`);
                logger.info('[GUIAgent] Response:', prediction);
                logger.info('[GUIAgent] Parsed Predictions:', JSON.stringify(parsedPredictions));
                if (!prediction) {
                    logger.error('[GUIAgent] Response Empty:', prediction);
                    continue;
                }

                const predictionSummary = getSummary(prediction);
                end = Date.now();
                data.conversations.push({
                    from: 'gpt',
                    value: predictionSummary,
                    timing: {
                        start,
                        end,
                        cost: end - start
                    },
                    screenshotContext: {
                        size: {
                            width,
                            height
                        },
                        scaleFactor: snapshot.scaleFactor
                    },
                    predictionParsed: parsedPredictions
                });
                await emitLatestConversation();

                for (const parsedPrediction of parsedPredictions) {
                    const actionType = parsedPrediction.action_type;
                    logger.info('[GUIAgent] Action:', actionType);
                    if (actionType === INTERNAL_ACTION_SPACES_ENUM.ERROR_ENV) {
                        Object.assign(data, {
                            status: StatusEnum.ERROR,
                            error: this.guiAgentErrorParser(ErrorStatusEnum.ENVIRONMENT_ERROR)
                        });
                        break;
                    }
                    if (actionType === INTERNAL_ACTION_SPACES_ENUM.MAX_LOOP) {
                        Object.assign(data, {
                            status: StatusEnum.ERROR,
                            error: this.guiAgentErrorParser(ErrorStatusEnum.REACH_MAXLOOP_ERROR)
                        });
                        break;
                    }
                    if (!signal?.aborted && !this.isStopped) {
                        logger.info('[GUIAgent] Action Inputs:', parsedPrediction.action_inputs, parsedPrediction.action_type);
                        let executeOutput;
                        try {
                            executeOutput = await async_retry(() => operator.execute({
                                prediction,
                                parsedPrediction,
                                screenWidth: width,
                                screenHeight: height,
                                scaleFactor: snapshot.scaleFactor,
                                factors: this.model.factors
                            }), {
                                retries: retry?.execute?.maxRetries ?? 0,
                                minTimeout: 5000,
                                onRetry: retry?.execute?.onRetry
                            });
                        } catch (e) {
                            logger.error('[GUIAgent] execute error', e);
                            Object.assign(data, {
                                status: StatusEnum.ERROR,
                                error: this.guiAgentErrorParser(ErrorStatusEnum.EXECUTE_RETRY_ERROR, e)
                            });
                            // Stop processing this prediction: running the
                            // remaining actions, or letting a later
                            // finished/call_user overwrite the status, would
                            // hide the failure from onError.
                            break;
                        }
                        if (executeOutput?.status) {
                            data.status = executeOutput.status;
                        }
                    }
                    if (actionType === INTERNAL_ACTION_SPACES_ENUM.CALL_USER) {
                        data.status = StatusEnum.CALL_USER;
                        break;
                    }
                    if (actionType === INTERNAL_ACTION_SPACES_ENUM.FINISHED) {
                        data.status = StatusEnum.END;
                        break;
                    }
                }

                if (this.config.loopIntervalInMs && this.config.loopIntervalInMs > 0) {
                    logger.info(`[GUIAgent] sleep for ${this.config.loopIntervalInMs}ms before next loop`);
                    await sleep(this.config.loopIntervalInMs);
                    logger.info(`[GUIAgent] sleep for ${this.config.loopIntervalInMs}ms before next loop done`);
                }
            }
        } catch (error) {
            logger.error('[GUIAgent] Catch error', error);
            if (isAbortError(error, 'AbortError')) {
                logger.info('[GUIAgent] Catch: request was aborted');
                data.status = StatusEnum.USER_STOPPED;
                return;
            }
            data.status = StatusEnum.ERROR;
            data.error = this.guiAgentErrorParser(ErrorStatusEnum.UNKNOWN_ERROR, error);
        } finally {
            logger.info('[GUIAgent] Finally: status', data.status);
            this.model?.reset();
            if (data.status === StatusEnum.USER_STOPPED) {
                // Let the operator react to the user stopping the run.
                await operator.execute({
                    prediction: '',
                    parsedPrediction: {
                        action_inputs: {},
                        reflection: null,
                        action_type: 'user_stop',
                        thought: ''
                    },
                    screenWidth: 0,
                    screenHeight: 0,
                    scaleFactor: 1,
                    factors: [
                        0,
                        0
                    ]
                });
            }
            await emitStatus();
            if (data.status === StatusEnum.ERROR) {
                onError?.({
                    data,
                    error: data.error || new GUIAgentError(ErrorStatusEnum.UNKNOWN_ERROR, 'Unknown error occurred')
                });
            }
            logger.info(`[GUIAgent] >>> totalTokens: ${totalTokens}, totalTime: ${totalTime}, loopCnt: ${loopCnt} <<<`);
        }
    }

    /**
     * Requests that the run loop pause at the start of its next iteration.
     * Calling it again while already paused is a no-op, so a loop that is
     * already waiting on the current gate is never left behind on an orphaned
     * promise.
     */
    pause() {
        if (this.isPaused && this.resumePromise) {
            return;
        }
        this.isPaused = true;
        this.resumePromise = new Promise((resolve) => {
            this.resolveResume = resolve;
        });
    }

    /**
     * Releases a pending pause (if any) and clears the paused flag.
     */
    resume() {
        if (this.resolveResume) {
            this.resolveResume();
            this.resumePromise = null;
            this.resolveResume = null;
        }
        this.isPaused = false;
    }

    /**
     * Marks the agent as stopped. A pending pause is released as well so that
     * a paused run loop wakes up, sees the stop flag and exits instead of
     * waiting for a separate resume() call.
     */
    stop() {
        this.isStopped = true;
        this.resume();
    }

    /**
     * Builds the system prompt. Uses the operator class's
     * `MANUAL.ACTION_SPACES` when it is a non-empty list, otherwise falls back
     * to the default prompt.
     *
     * @returns {string}
     */
    buildSystemPrompt() {
        const actionSpaces = this.operator.constructor?.MANUAL?.ACTION_SPACES;
        if (actionSpaces == null || actionSpaces.length === 0) {
            return SYSTEM_PROMPT;
        }
        return SYSTEM_PROMPT_TEMPLATE.replace('{{action_spaces_holder}}', actionSpaces.join('\n'));
    }

    /**
     * Converts a failure into a `GUIAgentError`.
     * An `InternalServerError` always maps to MODEL_SERVICE_ERROR; otherwise
     * the result is chosen by `type`, and unrecognised types become
     * UNKNOWN_ERROR.
     *
     * @param {*} type - One of the `ErrorStatusEnum` values.
     * @param {*} [error] - The underlying error, if there is one.
     * @returns {GUIAgentError}
     */
    guiAgentErrorParser(type, error) {
        this.logger.error('[GUIAgent] guiAgentErrorParser:', error);
        const detail = error?.message || '';
        const stack = error?.stack;
        let parseError = null;

        if (error instanceof InternalServerError) {
            this.logger.error('[GUIAgent] guiAgentErrorParser instanceof InternalServerError.');
            parseError = new GUIAgentError(ErrorStatusEnum.MODEL_SERVICE_ERROR, error.message, error.stack);
        } else {
            switch (type) {
                case ErrorStatusEnum.REACH_MAXLOOP_ERROR:
                    parseError = new GUIAgentError(ErrorStatusEnum.REACH_MAXLOOP_ERROR, `Has reached max loop count: ${detail}`, stack);
                    break;
                case ErrorStatusEnum.SCREENSHOT_RETRY_ERROR:
                    parseError = new GUIAgentError(ErrorStatusEnum.SCREENSHOT_RETRY_ERROR, `Too many screenshot failures: ${detail}`, stack);
                    break;
                case ErrorStatusEnum.INVOKE_RETRY_ERROR:
                    parseError = new GUIAgentError(ErrorStatusEnum.INVOKE_RETRY_ERROR, `Too many model invoke failures: ${detail}`, stack);
                    break;
                case ErrorStatusEnum.EXECUTE_RETRY_ERROR:
                    parseError = new GUIAgentError(ErrorStatusEnum.EXECUTE_RETRY_ERROR, `Too many action execute failures: ${detail}`, stack);
                    break;
                case ErrorStatusEnum.ENVIRONMENT_ERROR:
                    parseError = new GUIAgentError(ErrorStatusEnum.ENVIRONMENT_ERROR, `The environment error occurred when parsing the action: ${detail}`, stack);
                    break;
                default:
                    break;
            }
        }

        if (!parseError) {
            parseError = new GUIAgentError(
                ErrorStatusEnum.UNKNOWN_ERROR,
                error instanceof Error ? error.message : 'Unknown error occurred',
                error instanceof Error ? error.stack || 'null' : 'null'
            );
        }
        if (!parseError.stack) {
            Error.captureStackTrace(parseError, this.guiAgentErrorParser);
        }
        return parseError;
    }

    /**
     * Creates the identifier sent as the `X-Session-Id` header for one run.
     *
     * @returns {string} A v4 UUID.
     */
    generateSessionId() {
        return v4();
    }

    /**
     * @param {object} config - Agent configuration. Reads `operator`, `model`
     *   (a `UITarsModel` instance or the argument for constructing one),
     *   `logger` (defaults to `console`), `uiTarsVersion` and `systemPrompt`
     *   (defaults to the result of `buildSystemPrompt()`).
     */
    constructor(config) {
        super(config);
        _define_property(this, "operator", void 0);
        _define_property(this, "model", void 0);
        _define_property(this, "logger", void 0);
        _define_property(this, "uiTarsVersion", void 0);
        _define_property(this, "systemPrompt", void 0);
        _define_property(this, "isPaused", false);
        _define_property(this, "resumePromise", null);
        _define_property(this, "resolveResume", null);
        _define_property(this, "isStopped", false);
        this.operator = config.operator;
        this.model = config.model instanceof UITarsModel ? config.model : new UITarsModel(config.model);
        this.logger = config.logger || console;
        this.uiTarsVersion = config.uiTarsVersion;
        this.systemPrompt = config.systemPrompt || this.buildSystemPrompt();
    }
}

export { GUIAgent };
//# sourceMappingURL=GUIAgent.mjs.map