@ui-tars/sdk
A powerful cross-platform (any device/platform) toolkit for building GUI automation agents for UI-TARS.
/**
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
import OpenAI from "openai";
import { actionParser } from "@ui-tars/action-parser";
import { useContext } from "./context/useContext.mjs";
import { Model } from "./types.mjs";
import { convertToOpenAIMessages, convertToResponseApiInput, isMessageImage, preprocessResizeImage } from "./utils.mjs";
import { DEFAULT_FACTORS } from "./constants.mjs";
import { MAX_PIXELS_DOUBAO, MAX_PIXELS_V1_0, MAX_PIXELS_V1_5, UITarsModelVersion } from "@ui-tars/shared/types";
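// UITarsModel wraps an OpenAI-compatible endpoint for UI-TARS vision-language models.
// It supports two call paths: the stateless Chat Completions API (default) and the
// stateful Responses API, which chains turns server-side via previous_response_id.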
class UITarsModel extends Model {
get useResponsesApi() {
return this.modelConfig.useResponsesApi ?? false;
}
get factors() {
return DEFAULT_FACTORS;
}
get modelName() {
return this.modelConfig.model ?? 'unknown';
}
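// Drops the Responses API bookkeeping for the oldest screenshot turn
// (see the headImageContext handling in invokeModelProvider).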
reset() {
this.headImageContext = null;
}
async invokeModelProvider(uiTarsVersion = UITarsModelVersion.V1_0, params, options, headers) {
const { logger } = useContext();
const { messages, previousResponseId } = params;
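// V1.5 models allow a much larger output budget than earlier versions; the sampling
// defaults (temperature 0, top_p 0.7) favor deterministic action output.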
const { baseURL, apiKey, model, max_tokens = uiTarsVersion === UITarsModelVersion.V1_5 ? 65535 : 1000, temperature = 0, top_p = 0.7, ...restOptions } = this.modelConfig;
const openai = new OpenAI({
...restOptions,
maxRetries: 0,
baseURL,
apiKey
});
const createCompletionParams = {
model,
messages,
stream: false,
seed: null,
stop: null,
frequency_penalty: null,
presence_penalty: null,
max_tokens,
temperature,
top_p
};
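// Note: `thinking: { type: 'disabled' }` is not a standard OpenAI parameter; it is
// passed through to Doubao/UI-TARS-style endpoints that expose a "thinking" mode.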
const createCompletionParamsThinkingVp = {
...createCompletionParams,
thinking: {
type: 'disabled'
}
};
const startTime = Date.now();
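// Responses API path: only the messages after the last assistant turn are sent, since
// earlier turns already live server-side behind previous_response_id. To keep that
// remote context bounded, the response id tied to the oldest screenshot is deleted
// once that screenshot falls out of the local message window.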
if (this.useResponsesApi) {
const lastAssistantIndex = messages.findLastIndex((c) => c.role === 'assistant');
logger.info('[ResponseAPI] lastAssistantIndex: ', lastAssistantIndex);
const inputs = convertToResponseApiInput(lastAssistantIndex > -1 ? messages.slice(lastAssistantIndex + 1) : messages);
const headImageMessageIndex = messages.findIndex(isMessageImage);
if (this.headImageContext?.responseIds.length && this.headImageContext.messageIndex !== headImageMessageIndex) {
logger.info('[ResponseAPI] should [delete]: ', this.headImageContext, 'headImageMessageIndex', headImageMessageIndex);
const headImageResponseId = this.headImageContext.responseIds.shift();
if (headImageResponseId) {
const deletedResponse = await openai.responses.delete(headImageResponseId, {
headers
});
logger.info('[ResponseAPI] [deletedResponse]: ', headImageResponseId, deletedResponse);
}
}
let result;
let responseId = previousResponseId;
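// Inputs are sent one per request; each call chains on the previous response id so the
// server reconstructs the full conversation. Base64 image payloads are truncated in the
// log output below to keep logs readable.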
for (const input of inputs) {
const truncated = JSON.stringify([input], (key, value) => {
if (typeof value === 'string' && value.startsWith('data:image/')) return value.slice(0, 50) + '...[truncated]';
return value;
}, 2);
const responseParams = {
input: [
input
],
model,
temperature,
top_p,
stream: false,
max_output_tokens: max_tokens,
...responseId && {
previous_response_id: responseId
},
thinking: {
type: 'disabled'
}
};
logger.info('[ResponseAPI] [input]: ', truncated, 'previous_response_id', responseParams.previous_response_id, 'headImageMessageIndex', headImageMessageIndex);
result = await openai.responses.create(responseParams, {
...options,
timeout: 30000,
headers
});
logger.info('[ResponseAPI] [result]: ', result);
responseId = result?.id;
logger.info('[ResponseAPI] [responseId]: ', responseId);
if (responseId && isMessageImage(input)) {
this.headImageContext = {
messageIndex: headImageMessageIndex,
responseIds: [...(this.headImageContext?.responseIds ?? []), responseId]
};
}
logger.info('[ResponseAPI] [headImageContext]: ', this.headImageContext);
}
return {
prediction: result?.output_text ?? '',
costTime: Date.now() - startTime,
costTokens: result?.usage?.total_tokens ?? 0,
responseId
};
}
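// Default (non-Responses) path: one stateless Chat Completions request carrying the
// full message history.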
const result = await openai.chat.completions.create(createCompletionParamsThinkingVp, {
...options,
timeout: 30000,
headers
});
return {
prediction: result.choices?.[0]?.message?.content ?? '',
costTime: Date.now() - startTime,
costTokens: result.usage?.total_tokens ?? 0
};
}
async invoke(params) {
const { conversations, images, screenContext, scaleFactor, uiTarsVersion, headers, previousResponseId } = params;
const { logger, signal } = useContext();
logger?.info(`[UITarsModel] invoke: screenContext=${JSON.stringify(screenContext)}, scaleFactor=${scaleFactor}, uiTarsVersion=${uiTarsVersion}, useResponsesApi=${this.modelConfig.useResponsesApi}`);
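// Each model version has a different input-image pixel budget; screenshots are resized
// down to fit before being attached to the prompt.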
const maxPixels = uiTarsVersion === UITarsModelVersion.V1_5 ? MAX_PIXELS_V1_5 : uiTarsVersion === UITarsModelVersion.DOUBAO_1_5_15B || uiTarsVersion === UITarsModelVersion.DOUBAO_1_5_20B ? MAX_PIXELS_DOUBAO : MAX_PIXELS_V1_0;
const compressedImages = await Promise.all(images.map((image)=>preprocessResizeImage(image, maxPixels)));
const messages = convertToOpenAIMessages({
conversations,
images: compressedImages
});
const startTime = Date.now();
const result = await this.invokeModelProvider(uiTarsVersion, {
messages,
previousResponseId
}, {
signal
}, headers).catch((e) => {
logger?.error('[UITarsModel] error', e);
throw e;
}).finally(() => {
logger?.info(`[UITarsModel cost]: ${Date.now() - startTime}ms`);
});
if (!result.prediction) {
const err = new Error();
err.name = 'vlm response error';
err.stack = JSON.stringify(result) ?? 'no message';
logger?.error(err);
throw err;
}
const { prediction, costTime, costTokens, responseId } = result;
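// Parse the model's text output into structured GUI actions. If parsing fails, the raw
// prediction is still returned with an empty action list so the caller can recover.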
try {
const { parsed: parsedPredictions } = actionParser({
prediction,
factor: this.factors,
screenContext,
scaleFactor,
modelVer: uiTarsVersion
});
return {
prediction,
parsedPredictions,
costTime,
costTokens,
responseId
};
} catch (error) {
logger?.error('[UITarsModel] error', error);
return {
prediction,
parsedPredictions: [],
responseId
};
}
}
constructor(modelConfig) {
super();
this.modelConfig = modelConfig;
this.headImageContext = null;
}
}
export { UITarsModel };
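// Usage sketch (illustrative only): the constructor and invoke() fields below follow this
// file's own destructuring; the baseURL and model id are placeholders for your own
// UI-TARS-compatible deployment, and invoke() must run inside the SDK's execution
// context (useContext() supplies logger/signal), e.g. via a GUIAgent run.
//
//   import { UITarsModel } from '@ui-tars/sdk';
//
//   const model = new UITarsModel({
//     baseURL: 'https://example.com/v1',      // placeholder endpoint
//     apiKey: process.env.UI_TARS_API_KEY,
//     model: 'ui-tars-1.5',                   // placeholder model id
//     useResponsesApi: false,
//   });
//
//   // Inside an SDK context:
//   // const { prediction, parsedPredictions } = await model.invoke({
//   //   conversations,                        // prior dialogue turns
//   //   images,                               // base64-encoded screenshots
//   //   screenContext: { width: 1920, height: 1080 },
//   //   scaleFactor: 1,
//   //   uiTarsVersion: UITarsModelVersion.V1_5,
//   // });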
//# sourceMappingURL=Model.mjs.map