UNPKG

@ui-tars/sdk

Version:

A powerful cross-platform (ANY device/platform) toolkit for building GUI automation agents for UI-TARS

211 lines (210 loc) 8.88 kB
/**
 * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
 * SPDX-License-Identifier: Apache-2.0
 */
"use strict";

// Dependencies are kept as namespace objects and their members are read at
// call time, so nothing is captured from them while this module is loading.
const jimp = require("jimp");
const sharedConstants = require("@ui-tars/shared/constants");
const localConstants = require("./constants.js");

/** Heading that separates the system prompt from the user's instruction. */
const USER_INSTRUCTION_MARKER = '## User Instruction';

/** Number of trailing history messages included in the rendered history. */
const HISTORY_WINDOW = 30;

/**
 * Returns the `url` of an OpenAI-style `image_url` content item.
 *
 * @param {object} item - One entry of a message's `content` array.
 * @returns {*} `false` when `item.type` is not `'image_url'`, `undefined` when
 *   `item.image_url` is null/undefined, otherwise `item.image_url.url`.
 */
function readImageUrl(item) {
  if (item.type !== 'image_url') {
    return false;
  }
  const holder = item.image_url;
  return holder == null ? undefined : holder.url;
}

/**
 * Builds the value of the first human turn by merging the system prompt, the
 * rendered history and the user's own text.
 *
 * When the prompt already contains the user-instruction marker, the history is
 * inserted right before the marker's last occurrence; otherwise the history
 * and a marker line are appended after the prompt.
 *
 * @param {string} systemPrompt - Prompt text.
 * @param {string} history - Output of `formatHistoryMessages`.
 * @param {*} userValue - The original value of the first human turn.
 * @returns {string} The combined text.
 */
function buildFirstHumanValue(systemPrompt, history, userValue) {
  if (!systemPrompt.includes(USER_INSTRUCTION_MARKER)) {
    return `${systemPrompt}\n${history}\n${USER_INSTRUCTION_MARKER}\n${userValue}`;
  }

  const markerAt = systemPrompt.lastIndexOf(USER_INSTRUCTION_MARKER);
  const beforeMarker = systemPrompt.slice(0, markerAt);
  const fromMarker = systemPrompt.slice(markerAt);
  // Each piece is followed by exactly one line break unless it already ends with one.
  const prefixBreak = beforeMarker.endsWith('\n') ? '' : '\n';
  const suffixBreak = fromMarker.endsWith('\n') ? '' : '\n';
  return beforeMarker + prefixBreak + history + '\n' + fromMarker + suffixBreak + userValue;
}

/**
 * Converts a box string into a single point.
 *
 * The string is a comma-separated list of numbers; the first `[` and the first
 * `]` are stripped before parsing. With two numbers the point is used as is,
 * with four the midpoint of the two corners is used.
 *
 * @param {object} params
 * @param {string} params.boxStr - e.g. `"[x1, y1, x2, y2]"` or `"[x1, y1]"`.
 * @param {number} params.screenWidth - Multiplier applied to the x midpoint.
 * @param {number} params.screenHeight - Multiplier applied to the y midpoint.
 * @param {number[]} [params.factors] - `[widthFactor, heightFactor]`; defaults
 *   to `DEFAULT_FACTORS` from `./constants.js`.
 * @returns {{x: (number|null), y: (number|null)}} `{ x: null, y: null }` when
 *   `boxStr` is falsy; otherwise
 *   `round(mid * screenSize * factor) / factor` for each axis.
 */
function parseBoxToScreenCoords({
  boxStr,
  screenWidth,
  screenHeight,
  factors = localConstants.DEFAULT_FACTORS,
}) {
  if (!boxStr) {
    return { x: null, y: null };
  }

  const unwrapped = boxStr.replace('[', '').replace(']', '');
  const numbers = unwrapped.split(',').map((piece) => parseFloat(piece.trim()));
  // A two-number box is treated as a degenerate rectangle (both corners equal).
  const [x1, y1, x2 = x1, y2 = y1] = numbers;
  const [widthFactor, heightFactor] = factors;

  const midX = (x1 + x2) / 2;
  const midY = (y1 + y2) / 2;
  return {
    x: Math.round(midX * screenWidth * widthFactor) / widthFactor,
    y: Math.round(midY * screenHeight * heightFactor) / heightFactor,
  };
}

/**
 * Caps the number of images passed along with a conversation.
 *
 * When `images` holds more than `maxImageLength` entries, only the trailing
 * `maxImageLength` images are kept and the same number of image-placeholder
 * conversations is removed, counting from the start of `conversations`.
 *
 * @param {object[]} conversations - Entries with a `value` field.
 * @param {Array} images - Images matching the placeholder entries.
 * @param {number} [maxImageLength] - Defaults to `MAX_IMAGE_LENGTH` from
 *   `@ui-tars/shared/constants`.
 * @returns {{images: Array, conversations: object[]}} The (possibly trimmed)
 *   inputs; the original arrays are returned untouched when no trimming occurs.
 */
function processVlmParams(
  conversations,
  images,
  maxImageLength = sharedConstants.MAX_IMAGE_LENGTH,
) {
  if (!(images.length > maxImageLength)) {
    return { images, conversations };
  }

  const overflow = images.length - maxImageLength;
  const keptImages = images.slice(overflow);

  let pendingRemovals = overflow;
  const keptConversations = conversations.filter((entry) => {
    const shouldDrop =
      pendingRemovals > 0 && entry.value === sharedConstants.IMAGE_PLACEHOLDER;
    if (shouldDrop) {
      pendingRemovals -= 1;
    }
    return !shouldDrop;
  });

  return { images: keptImages, conversations: keptConversations };
}

/**
 * Prepares conversations and screenshots for the VLM.
 *
 * Only the conversation at index 0 is rewritten, and only when it comes from
 * `'human'`: its value becomes system prompt + history + user text (see
 * `buildFirstHumanValue`). Every other entry is copied as `{ from, value }`.
 *
 * @param {object} params
 * @param {object[]} params.historyMessages - Passed to `formatHistoryMessages`.
 * @param {object[]} params.conversations - Entries with `from`, `value` and an
 *   optional `screenshotBase64`.
 * @param {string} params.systemPrompt - Prompt text.
 * @returns {{conversations: object[], images: Array}} `images` lists the
 *   `screenshotBase64` of every image-placeholder entry that has a truthy one.
 */
function toVlmModelFormat({ historyMessages, conversations, systemPrompt }) {
  const history = formatHistoryMessages(historyMessages);

  const rewritten = conversations.map((turn, position) => {
    const isLeadingHumanTurn = position === 0 && turn.from === 'human';
    const value = isLeadingHumanTurn
      ? buildFirstHumanValue(systemPrompt, history, turn.value)
      : turn.value;
    return { from: turn.from, value };
  });

  const images = conversations
    .filter(
      (turn) =>
        turn.value === sharedConstants.IMAGE_PLACEHOLDER && !!turn.screenshotBase64,
    )
    .map((turn) => turn.screenshotBase64);

  return { conversations: rewritten, images };
}

/**
 * Removes every `Reflection:` section from a prediction.
 *
 * A section runs from `Reflection:` up to (not including) the next
 * `Action_Summary:` or `Action:`, or to the end of the text.
 *
 * @param {string} prediction - Raw model output.
 * @returns {string} The remaining text, trimmed.
 */
function getSummary(prediction) {
  const reflectionSection = /Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g;
  const withoutReflection = prediction.replace(reflectionSection, '');
  return withoutReflection.trim();
}

/**
 * Converts conversations into OpenAI chat messages.
 *
 * Image placeholders consume `images` in order and become `user` messages
 * carrying a PNG data URL; a placeholder with no image left is skipped. Any
 * other entry becomes a text message whose role is `'user'` for `'human'` and
 * `'assistant'` otherwise.
 *
 * @param {object} params
 * @param {object[]} params.conversations - Entries with `from` and `value`.
 * @param {string[]} params.images - Base64 payloads without a data-URL prefix.
 * @returns {object[]} The message list.
 */
function convertToOpenAIMessages({ conversations, images }) {
  const messages = [];
  let nextImage = 0;

  conversations.forEach((turn) => {
    if (turn.value !== sharedConstants.IMAGE_PLACEHOLDER) {
      messages.push({
        role: turn.from === 'human' ? 'user' : 'assistant',
        content: turn.value,
      });
      return;
    }

    // More placeholders than images: the surplus placeholders are dropped.
    if (!(nextImage < images.length)) {
      return;
    }

    const url = `data:image/png;base64,${images[nextImage]}`;
    messages.push({
      role: 'user',
      content: [{ type: 'image_url', image_url: { url } }],
    });
    nextImage += 1;
  });

  return messages;
}

/**
 * Strips a leading `data:image/<type>;base64,` prefix, if present.
 *
 * @param {string} base64 - Data URL or bare base64 string.
 * @returns {string} The string without the prefix.
 */
function replaceBase64Prefix(base64) {
  const dataUrlPrefix = /^data:image\/\w+;base64,/;
  return base64.replace(dataUrlPrefix, '');
}

/**
 * Re-encodes an image as PNG, shrinking it first when it has more than
 * `maxPixels` pixels. Both dimensions are scaled by
 * `sqrt(maxPixels / currentPixels)` and floored.
 *
 * @param {string} image_base64 - Base64-encoded image bytes.
 * @param {number} maxPixels - Upper bound on `width * height`.
 * @returns {Promise<string>} Base64 of the PNG, without a data-URL prefix.
 * @throws Re-throws any error after logging it with `console.error`.
 */
async function preprocessResizeImage(image_base64, maxPixels) {
  const outputMime = 'image/png';
  try {
    const rawBytes = Buffer.from(image_base64, 'base64');
    const image = await jimp.Jimp.read(rawBytes);
    const { width, height } = image.bitmap;
    const pixelCount = width * height;

    if (!(pixelCount > maxPixels)) {
      // Small enough already: only re-encode.
      const dataUrl = await image.getBase64(outputMime, { quality: 60 });
      return replaceBase64Prefix(dataUrl);
    }

    const scale = Math.sqrt(maxPixels / pixelCount);
    const target = {
      w: Math.floor(width * scale),
      h: Math.floor(height * scale),
    };
    const shrunk = image.resize(target);
    const encoded = await shrunk.getBuffer(outputMime, { quality: 60 });
    return encoded.toString('base64');
  } catch (error) {
    console.error('preprocessResizeImage error:', error);
    throw error;
  }
}

/**
 * Renders the most recent history messages as a `## History Messages` section
 * with one `role: value` line per message.
 *
 * @param {object[]} messages - Entries with `from` and `value`.
 * @returns {string} The section text, ending with a line break.
 */
function formatHistoryMessages(messages) {
  const recent = messages.slice(-HISTORY_WINDOW);
  const lines = recent.map((entry) => {
    const speaker = entry.from === 'human' ? 'human' : 'assistant';
    return `${speaker}: ${entry.value}`;
  });
  return `## History Messages\n${lines.join('\n')}\n`;
}

/**
 * Converts chat messages into Response-API input.
 *
 * Messages whose `content` is a non-empty array are copied as
 * `{ role, content }`, with each `image_url` item that has a truthy URL
 * replaced by `{ type: 'input_image', image_url: <url> }`. All other messages
 * are returned as they are.
 *
 * @param {object[]} messages - Chat messages.
 * @returns {object[]} The converted list.
 */
function convertToResponseApiInput(messages) {
  return messages.map((message) => {
    const parts = message == null ? undefined : message.content;
    if (!Array.isArray(parts) || !(parts.length > 0)) {
      return message;
    }

    const content = parts.map((item) => {
      if (readImageUrl(item)) {
        return { type: 'input_image', image_url: item.image_url.url };
      }
      return item;
    });
    return { role: message.role, content };
  });
}

/**
 * Tells whether a message is a user message that carries an image, in either
 * the `image_url` or the `input_image` representation.
 *
 * @param {object} c - Message to inspect.
 * @returns {boolean} `true` when at least one content item holds an image.
 */
function isMessageImage(c) {
  if (!('role' in c)) {
    return false;
  }
  if (c.role !== 'user') {
    return false;
  }
  if (!Array.isArray(c.content)) {
    return false;
  }
  return c.content.some(
    (item) => readImageUrl(item) || (item.type === 'input_image' && item.image_url),
  );
}

// One `exports.<name> =` assignment per export, in the same order as before.
exports.convertToOpenAIMessages = convertToOpenAIMessages;
exports.convertToResponseApiInput = convertToResponseApiInput;
exports.getSummary = getSummary;
exports.isMessageImage = isMessageImage;
exports.parseBoxToScreenCoords = parseBoxToScreenCoords;
exports.preprocessResizeImage = preprocessResizeImage;
exports.processVlmParams = processVlmParams;
exports.replaceBase64Prefix = replaceBase64Prefix;
exports.toVlmModelFormat = toVlmModelFormat;
Object.defineProperty(exports, '__esModule', { value: true });
//# sourceMappingURL=utils.js.map