@ui-tars/sdk
Version:
A powerful cross-platform (any device/platform) toolkit for building GUI automation agents for UI-TARS
250 lines (249 loc) • 11.5 kB
JavaScript
/**
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
;
var __webpack_require__ = {};
(()=>{
__webpack_require__.n = (module)=>{
var getter = module && module.__esModule ? ()=>module['default'] : ()=>module;
__webpack_require__.d(getter, {
a: getter
});
return getter;
};
})();
(()=>{
__webpack_require__.d = (exports1, definition)=>{
for(var key in definition)if (__webpack_require__.o(definition, key) && !__webpack_require__.o(exports1, key)) Object.defineProperty(exports1, key, {
enumerable: true,
get: definition[key]
});
};
})();
(()=>{
__webpack_require__.o = (obj, prop)=>Object.prototype.hasOwnProperty.call(obj, prop);
})();
(()=>{
__webpack_require__.r = (exports1)=>{
if ('undefined' != typeof Symbol && Symbol.toStringTag) Object.defineProperty(exports1, Symbol.toStringTag, {
value: 'Module'
});
Object.defineProperty(exports1, '__esModule', {
value: true
});
};
})();
// Namespace that collects this chunk's public exports; they are copied onto
// the CommonJS `exports` object at the bottom of the file.
var __webpack_exports__ = {};
__webpack_require__.r(__webpack_exports__);
// Register the chunk's single named export as a lazy getter (the class is
// declared further down; the getter is only invoked on first access).
__webpack_require__.d(__webpack_exports__, {
UITarsModel: ()=>UITarsModel
});
// External and sibling-module dependencies (CommonJS requires emitted by the bundler).
const external_openai_namespaceObject = require("openai");
// Unwraps the openai module's default export when it is an ES module.
var external_openai_default = /*#__PURE__*/ __webpack_require__.n(external_openai_namespaceObject);
const action_parser_namespaceObject = require("@ui-tars/action-parser");
const useContext_js_namespaceObject = require("./context/useContext.js");
const external_types_js_namespaceObject = require("./types.js");
const external_utils_js_namespaceObject = require("./utils.js");
const external_constants_js_namespaceObject = require("./constants.js");
const types_namespaceObject = require("@ui-tars/shared/types");
// swc class-field helper: set `obj[key] = value` and return `obj`.
// Keys already present (own or inherited) go through defineProperty with an
// explicit all-true descriptor; fresh keys use plain assignment, which
// produces the same writable/enumerable/configurable descriptor anyway.
function _define_property(obj, key, value) {
    const descriptor = {
        value,
        enumerable: true,
        configurable: true,
        writable: true
    };
    if (key in obj) {
        Object.defineProperty(obj, key, descriptor);
        return obj;
    }
    obj[key] = value;
    return obj;
}
/**
 * UITarsModel drives a UI-TARS vision-language model through an
 * OpenAI-compatible endpoint. It supports both the classic chat-completions
 * API and the stateful Responses API (chaining turns with
 * `previous_response_id` and deleting stale image-bearing responses).
 *
 * Fixes vs. previous revision:
 *  - constructor assigned `this.modelConfig` twice; duplicate removed.
 *  - the Responses-API branch read `this.modelConfig.useResponsesApi`
 *    directly, bypassing the `useResponsesApi` getter that normalizes
 *    `undefined` to `false`; it now uses the getter.
 *  - transpiler temp-variable null checks replaced with `?.`/`??`, which
 *    this file already targets.
 */
class UITarsModel extends external_types_js_namespaceObject.Model {
    /** Whether to call the OpenAI Responses API instead of chat completions. */
    get useResponsesApi() {
        return this.modelConfig.useResponsesApi ?? false;
    }
    /** Coordinate factors forwarded to the action parser. */
    get factors() {
        return external_constants_js_namespaceObject.DEFAULT_FACTORS;
    }
    /** Configured model identifier, or 'unknown' when unset. */
    get modelName() {
        return this.modelConfig.model ?? 'unknown';
    }
    /** Drop Responses-API bookkeeping between sessions. */
    reset() {
        this.headImageContext = null;
    }
    /**
     * Low-level provider call.
     *
     * @param uiTarsVersion model generation; selects the max_tokens default.
     * @param params `{ messages, previousResponseId }` — OpenAI-format
     *   messages and, for the Responses API, the id to chain from.
     * @param options extra request options (e.g. an abort `signal`).
     * @param headers extra HTTP headers forwarded to the provider.
     * @returns `{ prediction, costTime, costTokens, responseId? }` where
     *   `prediction` is the raw model text ('' when absent).
     */
    async invokeModelProvider(uiTarsVersion = types_namespaceObject.UITarsModelVersion.V1_0, params, options, headers) {
        const { logger } = (0, useContext_js_namespaceObject.useContext)();
        const { messages, previousResponseId } = params;
        const {
            baseURL,
            apiKey,
            model,
            // V1.5 models accept a much larger completion budget.
            max_tokens = uiTarsVersion == types_namespaceObject.UITarsModelVersion.V1_5 ? 65535 : 1000,
            temperature = 0,
            top_p = 0.7,
            ...restOptions
        } = this.modelConfig;
        const openai = new (external_openai_default())({
            ...restOptions,
            maxRetries: 0,
            baseURL,
            apiKey
        });
        const createCompletionPrams = {
            model,
            messages,
            stream: false,
            seed: null,
            stop: null,
            frequency_penalty: null,
            presence_penalty: null,
            max_tokens,
            temperature,
            top_p
        };
        const createCompletionPramsThinkingVp = {
            ...createCompletionPrams,
            // Explicitly disable provider-side "thinking" output.
            thinking: {
                type: 'disabled'
            }
        };
        const startTime = Date.now();
        if (this.useResponsesApi) {
            // Responses API path: only the messages added since the last
            // assistant turn are sent; earlier context is carried server-side
            // via previous_response_id.
            const lastAssistantIndex = messages.findLastIndex((c) => 'assistant' === c.role);
            logger.info('[ResponseAPI] lastAssistantIndex: ', lastAssistantIndex);
            const inputs = (0, external_utils_js_namespaceObject.convertToResponseApiInput)(lastAssistantIndex > -1 ? messages.slice(lastAssistantIndex + 1) : messages);
            const headImageMessageIndex = messages.findIndex(external_utils_js_namespaceObject.isMessageImage);
            // When the tracked head image no longer matches the first image
            // message (it scrolled out of the window), delete the oldest
            // image-bearing response server-side to free it.
            if (this.headImageContext?.responseIds.length && this.headImageContext?.messageIndex !== headImageMessageIndex) {
                logger.info('[ResponseAPI] should [delete]: ', this.headImageContext, 'headImageMessageIndex', headImageMessageIndex);
                const headImageResponseId = this.headImageContext.responseIds.shift();
                if (headImageResponseId) {
                    const deletedResponse = await openai.responses.delete(headImageResponseId, {
                        headers
                    });
                    logger.info('[ResponseAPI] [deletedResponse]: ', headImageResponseId, deletedResponse);
                }
            }
            let result;
            let responseId = previousResponseId;
            for (const input of inputs) {
                // Truncate base64 image payloads so logs stay readable.
                const truncated = JSON.stringify([
                    input
                ], (key, value) => {
                    if ('string' == typeof value && value.startsWith('data:image/')) return value.slice(0, 50) + '...[truncated]';
                    return value;
                }, 2);
                const responseParams = {
                    input: [
                        input
                    ],
                    model,
                    temperature,
                    top_p,
                    stream: false,
                    max_output_tokens: max_tokens,
                    // Chain onto the previous turn when we have one.
                    ...responseId && {
                        previous_response_id: responseId
                    },
                    thinking: {
                        type: 'disabled'
                    }
                };
                logger.info('[ResponseAPI] [input]: ', truncated, 'previous_response_id', responseParams.previous_response_id, 'headImageMessageIndex', headImageMessageIndex);
                result = await openai.responses.create(responseParams, {
                    ...options,
                    timeout: 30000,
                    headers
                });
                logger.info('[ResponseAPI] [result]: ', result);
                responseId = result?.id;
                logger.info('[ResponseAPI] [responseId]: ', responseId);
                // Remember ids of image-bearing responses so the oldest can
                // be deleted later (see the shift() above).
                if (responseId && (0, external_utils_js_namespaceObject.isMessageImage)(input)) {
                    this.headImageContext = {
                        messageIndex: headImageMessageIndex,
                        responseIds: [
                            ...(this.headImageContext?.responseIds || []),
                            responseId
                        ]
                    };
                }
                logger.info('[ResponseAPI] [headImageContext]: ', this.headImageContext);
            }
            return {
                prediction: result?.output_text ?? '',
                costTime: Date.now() - startTime,
                costTokens: result?.usage?.total_tokens ?? 0,
                responseId
            };
        }
        // Chat-completions path.
        const result = await openai.chat.completions.create(createCompletionPramsThinkingVp, {
            ...options,
            timeout: 30000,
            headers
        });
        return {
            prediction: result.choices?.[0]?.message?.content ?? '',
            costTime: Date.now() - startTime,
            costTokens: result.usage?.total_tokens ?? 0
        };
    }
    /**
     * High-level entry point: downscale screenshots to the version-specific
     * pixel budget, build OpenAI messages, call the provider, and parse the
     * prediction into GUI actions.
     *
     * @throws the provider error, or an Error named 'vlm response error'
     *   when the model returned an empty prediction. Action-parsing failures
     *   do NOT throw; they degrade to `parsedPredictions: []`.
     */
    async invoke(params) {
        const { conversations, images, screenContext, scaleFactor, uiTarsVersion, headers, previousResponseId } = params;
        const { logger, signal } = (0, useContext_js_namespaceObject.useContext)();
        logger?.info(`[UITarsModel] invoke: screenContext=${JSON.stringify(screenContext)}, scaleFactor=${scaleFactor}, uiTarsVersion=${uiTarsVersion}, useResponsesApi=${this.modelConfig.useResponsesApi}`);
        // Per-version pixel budget used to downscale screenshots.
        const maxPixels = uiTarsVersion === types_namespaceObject.UITarsModelVersion.V1_5
            ? types_namespaceObject.MAX_PIXELS_V1_5
            : uiTarsVersion === types_namespaceObject.UITarsModelVersion.DOUBAO_1_5_15B || uiTarsVersion === types_namespaceObject.UITarsModelVersion.DOUBAO_1_5_20B
                ? types_namespaceObject.MAX_PIXELS_DOUBAO
                : types_namespaceObject.MAX_PIXELS_V1_0;
        const compressedImages = await Promise.all(images.map((image) => (0, external_utils_js_namespaceObject.preprocessResizeImage)(image, maxPixels)));
        const messages = (0, external_utils_js_namespaceObject.convertToOpenAIMessages)({
            conversations,
            images: compressedImages
        });
        const startTime = Date.now();
        const result = await this.invokeModelProvider(uiTarsVersion, {
            messages,
            previousResponseId
        }, {
            signal
        }, headers).catch((e) => {
            logger?.error('[UITarsModel] error', e);
            throw e;
        }).finally(() => {
            logger?.info(`[UITarsModel cost]: ${Date.now() - startTime}ms`);
        });
        if (!result.prediction) {
            const err = new Error();
            err.name = 'vlm response error';
            // Stash the raw result on `stack` so callers/logs see the payload.
            err.stack = JSON.stringify(result) ?? 'no message';
            logger?.error(err);
            throw err;
        }
        const { prediction, costTime, costTokens, responseId } = result;
        try {
            const { parsed: parsedPredictions } = (0, action_parser_namespaceObject.actionParser)({
                prediction,
                factor: this.factors,
                screenContext,
                scaleFactor,
                modelVer: uiTarsVersion
            });
            return {
                prediction,
                parsedPredictions,
                costTime,
                costTokens,
                responseId
            };
        } catch (error) {
            // A malformed prediction is surfaced raw with no parsed actions
            // rather than failing the whole invocation.
            logger?.error('[UITarsModel] error', error);
            return {
                prediction,
                parsedPredictions: [],
                responseId
            };
        }
    }
    /**
     * @param modelConfig provider configuration: `baseURL`, `apiKey`,
     *   `model`, optional `useResponsesApi` and sampling overrides
     *   (`max_tokens`, `temperature`, `top_p`, ...rest passed to the client).
     */
    constructor(modelConfig) {
        super();
        // swc-style class-field initialisation via the _define_property helper.
        _define_property(this, "modelConfig", void 0);
        _define_property(this, "headImageContext", void 0);
        this.modelConfig = modelConfig;
        // Tracks the first image message and the response ids that carried
        // images, for Responses-API garbage collection.
        this.headImageContext = null;
    }
}
// Copy the chunk's exports onto the CommonJS `exports` object. The known
// named export is assigned directly; any other key (none today) is forwarded
// by enumeration so future chunk exports keep working.
exports.UITarsModel = __webpack_exports__.UITarsModel;
for (var __webpack_i__ in __webpack_exports__) {
    if (![
        "UITarsModel"
    ].includes(__webpack_i__)) exports[__webpack_i__] = __webpack_exports__[__webpack_i__];
}
Object.defineProperty(exports, '__esModule', {
    value: true
});
//# sourceMappingURL=Model.js.map