@ui-tars/sdk
Version:
A powerful cross-platform (ANY device/platform) toolkit for building GUI automation agents for UI-TARS
211 lines (210 loc) • 8.88 kB
JavaScript
/**
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
;
// Minimal bundler runtime: three helpers used to assemble this module's
// export object.
var __webpack_require__ = {};

// o — own-property test that goes through Object.prototype so it works even
// when the target shadows or lacks `hasOwnProperty`.
__webpack_require__.o = (obj, prop)=>Object.prototype.hasOwnProperty.call(obj, prop);

// d — for every own key of `definition` that `exports1` does not already own,
// install an enumerable getter on `exports1` backed by `definition[key]`.
__webpack_require__.d = (exports1, definition)=>{
    for(var key in definition){
        if (!__webpack_require__.o(definition, key)) continue;
        if (__webpack_require__.o(exports1, key)) continue;
        Object.defineProperty(exports1, key, {
            enumerable: true,
            get: definition[key]
        });
    }
};

// r — brand `exports1` as an ES module: Symbol.toStringTag = 'Module' (when
// symbols are available) and a non-enumerable `__esModule: true`.
__webpack_require__.r = (exports1)=>{
    var canTag = typeof Symbol !== 'undefined' && Symbol.toStringTag;
    if (canTag) {
        Object.defineProperty(exports1, Symbol.toStringTag, {
            value: 'Module'
        });
    }
    Object.defineProperty(exports1, '__esModule', {
        value: true
    });
};
// Export object for this module. It is branded as an ES module and then
// populated with one getter per public name.
var __webpack_exports__ = {};
__webpack_require__.r(__webpack_exports__);
// Each entry is a thunk, so the `const` bindings declared further down in this
// file are only read when the corresponding property is accessed, not at
// registration time.
__webpack_require__.d(__webpack_exports__, {
convertToOpenAIMessages: ()=>convertToOpenAIMessages,
toVlmModelFormat: ()=>toVlmModelFormat,
convertToResponseApiInput: ()=>convertToResponseApiInput,
preprocessResizeImage: ()=>preprocessResizeImage,
processVlmParams: ()=>processVlmParams,
getSummary: ()=>getSummary,
isMessageImage: ()=>isMessageImage,
parseBoxToScreenCoords: ()=>parseBoxToScreenCoords,
replaceBase64Prefix: ()=>replaceBase64Prefix
});
const external_jimp_namespaceObject = require("jimp");
const constants_namespaceObject = require("@ui-tars/shared/constants");
const external_constants_js_namespaceObject = require("./constants.js");
/**
 * Turn a textual box/point into a single point scaled by the screen size.
 *
 * `boxStr` is a comma-separated list of numbers, optionally wrapped in square
 * brackets. Two numbers (`x1, y1`) are treated as a point; four numbers
 * (`x1, y1, x2, y2`) are treated as a box whose centre is used. All square
 * brackets are stripped before parsing, so nested forms such as
 * `[[x1, y1], [x2, y2]]` are accepted as well.
 *
 * @param {object} params
 * @param {string} [params.boxStr] - Box text; an empty/missing value yields nulls.
 * @param {number} params.screenWidth - Multiplier applied to the x centre.
 * @param {number} params.screenHeight - Multiplier applied to the y centre.
 * @param {number[]} [params.factors] - `[widthFactor, heightFactor]`; the result is
 *   rounded to a multiple of `1 / factor` per axis. Defaults to `DEFAULT_FACTORS`
 *   from `./constants.js`.
 * @returns {{x: (number|null), y: (number|null)}} The scaled centre. An axis that
 *   cannot be computed (missing or non-numeric input, zero factor) is `null`
 *   rather than `NaN`/`Infinity`, matching the sentinel used for empty input.
 */
const parseBoxToScreenCoords = ({ boxStr, screenWidth, screenHeight, factors = external_constants_js_namespaceObject.DEFAULT_FACTORS })=>{
    if (!boxStr) return {
        x: null,
        y: null
    };
    // Strip every bracket, not just the first pair, so nested boxes still parse.
    const coords = boxStr.replace(/[\[\]]/g, '').split(',').map((num)=>parseFloat(num.trim()));
    // With only a point given, the "box" collapses to that point.
    const [x1, y1, x2 = x1, y2 = y1] = coords;
    const [widthFactor, heightFactor] = factors;
    const x = Math.round((x1 + x2) / 2 * screenWidth * widthFactor) / widthFactor;
    const y = Math.round((y1 + y2) / 2 * screenHeight * heightFactor) / heightFactor;
    // Never hand NaN/Infinity to callers; they only need to check for null.
    return {
        x: Number.isFinite(x) ? x : null,
        y: Number.isFinite(y) ? y : null
    };
};
/**
 * Cap the number of images sent with a conversation.
 *
 * When `images` holds more than `maxImageLength` entries, the oldest surplus
 * images are dropped from the front, and the same number of image-placeholder
 * turns is removed from the front of `conversations`. Otherwise both inputs are
 * returned as-is (same array references).
 *
 * @param {Array<{value: *}>} conversations - Conversation turns.
 * @param {Array} images - Images, in the order their placeholders appear.
 * @param {number} [maxImageLength] - Upper bound; defaults to `MAX_IMAGE_LENGTH`.
 * @returns {{images: Array, conversations: Array}}
 */
const processVlmParams = (conversations, images, maxImageLength = constants_namespaceObject.MAX_IMAGE_LENGTH)=>{
    const overLimit = images.length > maxImageLength;
    if (!overLimit) {
        return {
            images,
            conversations
        };
    }
    const surplus = images.length - maxImageLength;
    let placeholdersToDrop = surplus;
    // Predicate for filter(): rejects placeholder turns until the quota is used up.
    const keepTurn = (turn)=>{
        const dropIt = placeholdersToDrop > 0 && turn.value === constants_namespaceObject.IMAGE_PLACEHOLDER;
        if (dropIt) placeholdersToDrop -= 1;
        return !dropIt;
    };
    return {
        images: images.slice(surplus),
        conversations: conversations.filter(keepTurn)
    };
};
/**
 * Build the `{ conversations, images }` payload for the model.
 *
 * The first turn, when it comes from `human`, is rewritten to carry the system
 * prompt, the formatted history and the user's instruction. If the system
 * prompt already contains a `## User Instruction` heading, the history is
 * spliced in just before its last occurrence; otherwise the heading is appended
 * after the history. Every other turn is copied as `{ from, value }`.
 *
 * `images` collects `screenshotBase64` from the placeholder turns that have one.
 *
 * @param {object} params
 * @param {Array} params.historyMessages - Passed to `formatHistoryMessages`.
 * @param {Array<{from: string, value: string, screenshotBase64: (string|undefined)}>} params.conversations
 * @param {string} params.systemPrompt
 * @returns {{conversations: Array<{from: string, value: string}>, images: string[]}}
 */
const toVlmModelFormat = ({ historyMessages, conversations, systemPrompt })=>{
    const marker = '## User Instruction';
    const historyBlock = formatHistoryMessages(historyMessages);
    const withTrailingNewline = (text)=>text.endsWith('\n') ? text : text + '\n';
    // Compose the text of the opening human turn.
    const composeOpeningTurn = (instruction)=>{
        const markerAt = systemPrompt.lastIndexOf(marker);
        if (markerAt === -1) {
            return `${systemPrompt}\n${historyBlock}\n${marker}\n${instruction}`;
        }
        const beforeMarker = systemPrompt.slice(0, markerAt);
        const fromMarker = systemPrompt.slice(markerAt);
        return withTrailingNewline(beforeMarker) + historyBlock + '\n' + withTrailingNewline(fromMarker) + instruction;
    };
    const rewrittenTurns = conversations.map((conv, idx)=>{
        const isOpeningHumanTurn = idx === 0 && conv.from === 'human';
        return {
            from: conv.from,
            value: isOpeningHumanTurn ? composeOpeningTurn(conv.value) : conv.value
        };
    });
    const screenshots = conversations
        .filter((conv)=>conv.value === constants_namespaceObject.IMAGE_PLACEHOLDER && Boolean(conv.screenshotBase64))
        .map(({ screenshotBase64 })=>screenshotBase64);
    return {
        conversations: rewrittenTurns,
        images: screenshots
    };
};
/**
 * Strip every `Reflection:` section from a prediction string.
 *
 * A section runs from `Reflection:` up to (but not including) the next
 * `Action_Summary:` or `Action:` label, or to the end of the string. The
 * remaining text is returned with surrounding whitespace trimmed.
 *
 * @param {string} prediction
 * @returns {string}
 */
const getSummary = (prediction)=>{
    const reflectionSection = /Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g;
    const withoutReflection = prediction.replace(reflectionSection, '');
    return withoutReflection.trim();
};
/**
 * Map conversation turns to chat-completion style messages.
 *
 * Text turns become `{ role, content }` with role `user` for `human` turns and
 * `assistant` for everything else. Each image-placeholder turn consumes the
 * next entry of `images` and becomes a `user` message with one `image_url`
 * part (PNG data URL). Placeholders left over once `images` is exhausted
 * produce no message.
 *
 * @param {object} params
 * @param {Array<{from: string, value: string}>} params.conversations
 * @param {string[]} params.images - Base64 payloads without a data-URL prefix.
 * @returns {Array<object>}
 */
const convertToOpenAIMessages = ({ conversations, images })=>{
    const asImageMessage = (base64)=>({
        role: 'user',
        content: [
            {
                type: 'image_url',
                image_url: {
                    url: `data:image/png;base64,${base64}`
                }
            }
        ]
    });
    const asTextMessage = (conv)=>({
        role: conv.from === 'human' ? 'user' : 'assistant',
        content: conv.value
    });
    const output = [];
    let nextImage = 0;
    conversations.forEach((conv)=>{
        const isPlaceholder = conv.value === constants_namespaceObject.IMAGE_PLACEHOLDER;
        if (!isPlaceholder) {
            output.push(asTextMessage(conv));
            return;
        }
        // No image left for this placeholder: emit nothing.
        if (!(nextImage < images.length)) return;
        output.push(asImageMessage(images[nextImage]));
        nextImage += 1;
    });
    return output;
};
/**
 * Remove a leading `data:image/<subtype>;base64,` prefix from a string.
 *
 * The subtype may contain word characters plus `.`, `+` and `-`, so prefixes
 * such as `image/svg+xml`, `image/x-icon` and `image/vnd.microsoft.icon` are
 * stripped as well as `image/png` or `image/jpeg`. Strings without such a
 * prefix are returned unchanged.
 *
 * @param {string} base64 - A data URL or a bare base64 payload.
 * @returns {string} The payload without the data-URL prefix.
 */
function replaceBase64Prefix(base64) {
    return base64.replace(/^data:image\/[\w.+-]+;base64,/, '');
}
/**
 * Re-encode an image as PNG base64, shrinking it first when it has more than
 * `maxPixels` pixels.
 *
 * The scale factor is `sqrt(maxPixels / currentPixels)`, applied to both sides
 * and floored. Each side is clamped to at least 1 pixel so that a very thin
 * image never ends up with a zero dimension; in that extreme case the result
 * can still exceed `maxPixels`.
 *
 * @param {string} image_base64 - Base64 image data. A leading
 *   `data:image/...;base64,` prefix is tolerated and removed before decoding.
 * @param {number} maxPixels - Maximum `width * height` before resizing kicks in.
 * @returns {Promise<string>} Base64 PNG payload without a data-URL prefix.
 * @throws Re-throws any decoding/encoding error after logging it.
 */
async function preprocessResizeImage(image_base64, maxPixels) {
    try {
        // Decoding a full data URL as base64 would treat the prefix text as
        // payload; strip it first. Non-string inputs are passed through untouched.
        const rawBase64 = 'string' === typeof image_base64 ? replaceBase64Prefix(image_base64) : image_base64;
        const imageBuffer = Buffer.from(rawBase64, 'base64');
        const image = await external_jimp_namespaceObject.Jimp.read(imageBuffer);
        const { width, height } = image.bitmap;
        const currentPixels = width * height;
        if (currentPixels > maxPixels) {
            const resizeFactor = Math.sqrt(maxPixels / currentPixels);
            // Flooring alone can yield 0 for extreme aspect ratios.
            const newWidth = Math.max(1, Math.floor(width * resizeFactor));
            const newHeight = Math.max(1, Math.floor(height * resizeFactor));
            // NOTE(review): `quality` is passed alongside a PNG mime type; whether
            // the image library honours it for PNG is not visible here.
            const resized = await image.resize({
                w: newWidth,
                h: newHeight
            }).getBuffer('image/png', {
                quality: 60
            });
            return resized.toString('base64');
        }
        const base64 = await image.getBase64('image/png', {
            quality: 60
        });
        return replaceBase64Prefix(base64);
    } catch (error) {
        console.error('preprocessResizeImage error:', error);
        throw error;
    }
}
/**
 * Render the most recent history turns as a text section.
 *
 * Only the last 30 entries are used. Each becomes `"<role>: <value>"`, where
 * role is `human` for `from === 'human'` and `assistant` otherwise. The section
 * starts with a `## History Messages` heading and ends with a newline.
 *
 * @param {Array<{from: string, value: string}>} messages
 * @returns {string}
 */
function formatHistoryMessages(messages) {
    const HISTORY_WINDOW = 30;
    const renderedTurns = messages.slice(-HISTORY_WINDOW).map(({ from, value })=>{
        const speaker = from === 'human' ? 'human' : 'assistant';
        return `${speaker}: ${value}`;
    });
    return `## History Messages\n${renderedTurns.join('\n')}\n`;
}
/**
 * Convert chat-style messages to the shape used for response-API input.
 *
 * Messages whose `content` is a non-empty array are rebuilt as
 * `{ role, content }`, with every `image_url` part that carries a URL replaced
 * by `{ type: 'input_image', image_url: <url string> }`. All other parts, and
 * all other messages (string content, empty arrays, nullish entries), are
 * passed through untouched.
 *
 * @param {Array<object>} messages
 * @returns {Array<object>}
 */
const convertToResponseApiInput = (messages)=>{
    // Rewrite a single content part when it is an image reference with a URL.
    const toResponsePart = (item)=>{
        if (item.type !== 'image_url') return item;
        const source = item.image_url;
        if (source == null || !source.url) return item;
        return {
            type: 'input_image',
            image_url: item.image_url.url
        };
    };
    return messages.map((message)=>{
        const parts = message == null ? undefined : message.content;
        if (!Array.isArray(parts) || !(parts.length > 0)) return message;
        return {
            role: message.role,
            content: parts.map((part)=>toResponsePart(part))
        };
    });
};
/**
 * Tell whether a message is a user message that carries an image part.
 *
 * A part counts as an image when it is `image_url` with a truthy
 * `image_url.url`, or `input_image` with a truthy `image_url`.
 *
 * @param {object} c - Message object to inspect.
 * @returns {boolean}
 */
const isMessageImage = (c)=>{
    if (!('role' in c) || c.role !== 'user') return false;
    if (!Array.isArray(c.content)) return false;
    return c.content.some((item)=>{
        if (item.type === 'image_url') {
            return Boolean(item.image_url == null ? undefined : item.image_url.url);
        }
        if (item.type === 'input_image') {
            return Boolean(item.image_url);
        }
        return false;
    });
};
// Publish the module on the CommonJS `exports` object. Each read of
// `__webpack_exports__.<name>` goes through the getter registered earlier and
// therefore picks up the binding defined above in this file.
exports.convertToOpenAIMessages = __webpack_exports__.convertToOpenAIMessages;
exports.convertToResponseApiInput = __webpack_exports__.convertToResponseApiInput;
exports.getSummary = __webpack_exports__.getSummary;
exports.isMessageImage = __webpack_exports__.isMessageImage;
exports.parseBoxToScreenCoords = __webpack_exports__.parseBoxToScreenCoords;
exports.preprocessResizeImage = __webpack_exports__.preprocessResizeImage;
exports.processVlmParams = __webpack_exports__.processVlmParams;
exports.replaceBase64Prefix = __webpack_exports__.replaceBase64Prefix;
exports.toVlmModelFormat = __webpack_exports__.toVlmModelFormat;
// Forward any other enumerable key of `__webpack_exports__` that is not in the
// explicit list of names already assigned above.
for(var __webpack_i__ in __webpack_exports__)if (-1 === [
"convertToOpenAIMessages",
"convertToResponseApiInput",
"getSummary",
"isMessageImage",
"parseBoxToScreenCoords",
"preprocessResizeImage",
"processVlmParams",
"replaceBase64Prefix",
"toVlmModelFormat"
].indexOf(__webpack_i__)) exports[__webpack_i__] = __webpack_exports__[__webpack_i__];
// Define `__esModule: true` on the CommonJS export object.
Object.defineProperty(exports, '__esModule', {
value: true
});
//# sourceMappingURL=utils.js.map