@gui-agent/action-parser
Version:
Action parser SDK for general action parser
277 lines (276 loc) • 12.7 kB
JavaScript
/**
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
"use strict";
// Minimal bundler runtime: a plain object carrying the helper functions that
// the generated module code below uses to wire up its exports.
var __webpack_require__ = {};

/**
 * n: returns an accessor for a module's "default" value.
 * For a module flagged with `__esModule` the accessor yields `module.default`;
 * otherwise it yields the module object itself. The accessor also exposes the
 * same value through a getter property named `a`.
 */
__webpack_require__.n = (mod) => {
    const unwrap = mod && mod.__esModule ? () => mod['default'] : () => mod;
    __webpack_require__.d(unwrap, { a: unwrap });
    return unwrap;
};

/**
 * d: for every own key of `getters` that `target` does not already own,
 * defines an enumerable getter property on `target`.
 */
__webpack_require__.d = (target, getters) => {
    for (const name in getters) {
        // Skip inherited keys of the definition object.
        if (!__webpack_require__.o(getters, name)) continue;
        // Never overwrite something the target already owns.
        if (__webpack_require__.o(target, name)) continue;
        Object.defineProperty(target, name, {
            enumerable: true,
            get: getters[name],
        });
    }
};

/** o: own-property check that does not rely on the object's own methods. */
__webpack_require__.o = (subject, key) => Object.prototype.hasOwnProperty.call(subject, key);

/**
 * r: tags an exports object as an ES module by setting `__esModule` to true
 * and, where supported, `Symbol.toStringTag` to 'Module'.
 */
__webpack_require__.r = (target) => {
    const canTag = typeof Symbol !== 'undefined' && Symbol.toStringTag;
    if (canTag) {
        Object.defineProperty(target, Symbol.toStringTag, { value: 'Module' });
    }
    Object.defineProperty(target, '__esModule', { value: true });
};
// Export table for this bundle. The object is first tagged as an ES module
// (`__esModule` / Symbol.toStringTag via `__webpack_require__.r`), then each
// public function is registered as an enumerable getter via
// `__webpack_require__.d`. The getters are arrow functions, so the function
// declarations they name are only looked up when a property is read.
var __webpack_exports__ = {};
__webpack_require__.r(__webpack_exports__);
__webpack_require__.d(__webpack_exports__, {
actionParser: ()=>actionParser,
parseActionVlm: ()=>parseActionVlm,
actionStringParser: ()=>actionStringParser
});
const types_namespaceObject = require("@ui-tars/shared/types");
const external_lodash_isnumber_namespaceObject = require("lodash.isnumber");
var external_lodash_isnumber_default = /*#__PURE__*/ __webpack_require__.n(external_lodash_isnumber_namespaceObject);
/**
 * Snaps `num` to the nearest multiple of `factor` (ties resolved by Math.round).
 * @param {number} num - Value to snap.
 * @param {number} factor - Step size.
 * @returns {number} The nearest multiple of `factor`.
 */
function roundByFactor(num, factor) {
    const steps = Math.round(num / factor);
    return steps * factor;
}
/**
 * Snaps `num` down to the largest multiple of `factor` that does not exceed it.
 * @param {number} num - Value to snap.
 * @param {number} factor - Step size.
 * @returns {number} The multiple of `factor` at or below `num`.
 */
function floorByFactor(num, factor) {
    const steps = Math.floor(num / factor);
    return steps * factor;
}
/**
 * Snaps `num` up to the smallest multiple of `factor` that is not below it.
 * @param {number} num - Value to snap.
 * @param {number} factor - Step size.
 * @returns {number} The multiple of `factor` at or above `num`.
 */
function ceilByFactor(num, factor) {
    const steps = Math.ceil(num / factor);
    return steps * factor;
}
/**
 * Computes a resized [width, height] pair whose sides are multiples of
 * `factor` and whose area is pushed into the [minPixels, maxPixels] range.
 *
 * @param {number} height - Source height.
 * @param {number} width - Source width.
 * @param {number} [maxRatio] - Largest accepted long-side / short-side ratio.
 * @param {number} [factor] - Both output sides are multiples of this value.
 * @param {number} [minPixels] - Lower bound on the rounded area.
 * @param {number} [maxPixels] - Upper bound on the rounded area.
 * @returns {?number[]} `[width, height]` after resizing, or `null` (after
 *   logging via console.error) when the aspect ratio exceeds `maxRatio`.
 */
function smartResizeForV15(height, width, maxRatio = types_namespaceObject.MAX_RATIO, factor = types_namespaceObject.IMAGE_FACTOR, minPixels = types_namespaceObject.MIN_PIXELS, maxPixels = types_namespaceObject.MAX_PIXELS_V1_5) {
    const longSide = Math.max(height, width);
    const shortSide = Math.min(height, width);
    if (longSide / shortSide > maxRatio) {
        console.error(`absolute aspect ratio must be smaller than ${maxRatio}, got ${longSide / shortSide}`);
        return null;
    }

    // First guess: round each side to the nearest multiple, never below one step.
    const roundedWidth = Math.max(factor, roundByFactor(width, factor));
    const roundedHeight = Math.max(factor, roundByFactor(height, factor));
    const roundedArea = roundedHeight * roundedWidth;

    if (roundedArea > maxPixels) {
        // Too large: scale the ORIGINAL sides down and round towards zero.
        const shrink = Math.sqrt(height * width / maxPixels);
        return [
            floorByFactor(width / shrink, factor),
            floorByFactor(height / shrink, factor),
        ];
    }

    if (roundedArea < minPixels) {
        // Too small: scale the ORIGINAL sides up and round away from zero.
        const grow = Math.sqrt(minPixels / (height * width));
        return [
            ceilByFactor(width * grow, factor),
            ceilByFactor(height * grow, factor),
        ];
    }

    return [roundedWidth, roundedHeight];
}
/**
 * Convenience wrapper around `parseActionVlm` that takes a single options
 * object and returns the parsed actions under a `parsed` key.
 *
 * @param {object} params
 * @param {string} params.prediction - Raw model output to parse.
 * @param {number|number[]} [params.factor] - Either one number (used for both
 *   axes) or a two-element array. When omitted, nothing is forwarded so that
 *   `parseActionVlm` applies its own default instead of receiving
 *   `[undefined, undefined]`, which would turn every coordinate into NaN.
 * @param {string} [params.mode] - Forwarded unchanged.
 * @param {object} [params.screenContext] - Forwarded unchanged.
 * @param {number} [params.scaleFactor] - Forwarded unchanged.
 * @param {*} [params.modelVer] - Forwarded unchanged.
 * @returns {{parsed: object[]}} The list produced by `parseActionVlm`.
 */
function actionParser(params) {
    const { prediction, factor, mode, screenContext, scaleFactor, modelVer } = params;

    // Normalise `factor` into the pair form expected by parseActionVlm.
    let factors;
    if (Array.isArray(factor)) {
        factors = factor;
    } else if (factor != null) {
        factors = [factor, factor];
    }

    const parsed = parseActionVlm(prediction, factors, mode, screenContext, scaleFactor, modelVer);
    return { parsed };
}
/**
 * Extracts the raw action string(s) from a model prediction without parsing
 * them any further. Three output layouts are tried in order:
 *
 *  1. `<think>…</think>` together with `<computer_env>…</computer_env>`:
 *     the computer_env body (minus a leading "Action:") is used.
 *  2. Text containing "Action:" or "Action：" (full-width colon): everything
 *     after the LAST such marker is used, and each piece is trimmed.
 *  3. `\nAction: … </Output>`: the captured text is used.
 *
 * In every case the action text is split on blank lines ("\n\n") so that
 * several actions in one prediction become separate array entries.
 *
 * @param {string} prediction - Raw model output.
 * @returns {string[]} The action strings, or an empty array if none was found.
 */
function actionStringParser(prediction) {
    const text = prediction.trim();

    // Layout 1: both tag pairs must be present for the tagged form to apply.
    const hasThinkBlock = /<think[^>]*>[\s\S]*?<\/think[^>]*>/i.test(text);
    const computerEnvMatch = text.match(/<computer_env>([\s\S]*?)<\/computer_env>/i);
    if (hasThinkBlock && computerEnvMatch) {
        const tagged = computerEnvMatch[1].trim().replace(/^Action:\s*/i, '');
        if (tagged !== '') return tagged.split('\n\n');
    }

    // Layout 2: the split pattern must accept the same two colons as the
    // keyword check, otherwise a full-width "Action：" is detected but never
    // separated from the preceding thought text.
    if (text.includes('Action:') || text.includes('Action\uFF1A')) {
        const parts = text.split(/Action[:\uFF1A]/);
        const tail = parts[parts.length - 1];
        if (tail !== '') return tail.split('\n\n').map((piece) => piece.trim());
    }

    // Layout 3: only reached when the text after the last marker was empty.
    const outputMatch = text.match(/\nAction:\s*(.*?)\s*<\/Output>/);
    const outputAction = outputMatch ? outputMatch[1] : '';
    return outputAction !== '' ? outputAction.split('\n\n') : [];
}
/**
 * Parses a model prediction into a list of structured actions.
 *
 * The thought / reflection text and the raw action text are extracted
 * according to `mode`, then the action text is split on blank lines and each
 * piece is handed to `parseAction`. Parameters whose name contains
 * `start_box` or `end_box` are converted to numbers and divided by a per-axis
 * factor; all other parameters are stored as trimmed strings.
 *
 * @param {string} text - Raw model output.
 * @param {number[]} [factors=[1000, 1000]] - `[xFactor, yFactor]` divisors for
 *   box numbers (even positions use index 0, odd positions index 1).
 * @param {string} [mode='bc'] - 'bc' (Thought/Reflection/Action_Summary +
 *   Action) or 'o1' (<Thought>…</Thought> … </Output>). Any other value
 *   skips both extraction branches.
 * @param {{width: number, height: number}} [screenContext] - When both sides
 *   are truthy, `start_coords` / `end_coords` are added for each box.
 * @param {number} [scaleFactor] - Multiplier for those coords; nullish means 1.
 * @param {*} [modelVer] - For V1_5 with a usable `screenContext`, the divisors
 *   come from `smartResizeForV15` instead of `factors` (unless it returns null).
 * @returns {Array<{reflection: ?string, thought: string, action_type: string,
 *   action_inputs: object}>} One entry per blank-line-separated action piece;
 *   pieces `parseAction` rejects yield an empty type and empty inputs.
 */
function parseActionVlm(text, factors = [
    1000,
    1000
], mode = 'bc', screenContext, scaleFactor, modelVer = types_namespaceObject.UITarsModelVersion.V1_0) {
    let reflection = null;
    let thought = null;
    let actionStr = '';

    const hasScreenSize = Boolean(screenContext && screenContext.width && screenContext.height);
    const isV15 = modelVer === types_namespaceObject.UITarsModelVersion.V1_5;
    const smartResizeFactors = isV15 && hasScreenSize
        ? smartResizeForV15(screenContext.height, screenContext.width)
        : null;
    // smartResizeForV15 may return null (bad aspect ratio); fall back to `factors`.
    const divisors = smartResizeFactors || factors;

    const input = text.trim();

    if (mode === 'bc') {
        // The lookaheads and the split accept both the ASCII and the
        // full-width colon, matching the keyword check below.
        if (input.includes('Thought:')) {
            const thoughtMatch = input.match(/Thought: ([\s\S]+?)(?=\s*Action[:\uFF1A]|$)/);
            if (thoughtMatch) thought = thoughtMatch[1].trim();
        } else if (input.startsWith('Reflection:')) {
            const reflectionMatch = input.match(/Reflection: ([\s\S]+?)Action_Summary: ([\s\S]+?)(?=\s*Action[:\uFF1A]|$)/);
            if (reflectionMatch) {
                reflection = reflectionMatch[1].trim();
                thought = reflectionMatch[2].trim();
            }
        } else if (input.startsWith('Action_Summary:')) {
            const summaryMatch = input.match(/Action_Summary: (.+?)(?=\s*Action[:\uFF1A]|$)/);
            if (summaryMatch) thought = summaryMatch[1].trim();
        }

        if (input.includes('Action:') || input.includes('Action\uFF1A')) {
            // Everything after the LAST marker is the action text.
            const parts = input.split(/Action[:\uFF1A]/);
            actionStr = parts[parts.length - 1];
        } else {
            actionStr = input;
        }
    } else if (mode === 'o1') {
        const thoughtMatch = input.match(/<Thought>\s*(.*?)\s*<\/Thought>/);
        const actionSummaryMatch = input.match(/\nAction_Summary:\s*(.*?)\s*Action:/);
        const actionMatch = input.match(/\nAction:\s*(.*?)\s*<\/Output>/);
        const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
        const actionSummaryContent = actionSummaryMatch ? actionSummaryMatch[1] : null;
        // A missing section is interpolated as the literal text "null".
        thought = `${thoughtContent}\n<Action_Summary>\n${actionSummaryContent}`;
        actionStr = actionMatch ? actionMatch[1] || '' : '';
    }

    // Tagged layout overrides whatever the mode-specific branch produced,
    // but only when BOTH tag pairs are present.
    const thinkMatch = input.match(/<think[^>]*>([\s\S]*?)<\/think[^>]*>/i);
    const computerEnvMatch = input.match(/<computer_env>([\s\S]*?)<\/computer_env>/i);
    if (thinkMatch && computerEnvMatch) {
        thought = thinkMatch[1].trim();
        actionStr = computerEnvMatch[1].trim().replace(/^Action:\s*/i, '');
    }

    const actions = [];
    for (const rawStr of actionStr.split('\n\n')) {
        // Real newlines inside one action are turned into the two-character
        // sequence backslash + "n" before parsing.
        const actionInstance = parseAction(rawStr.replace(/\n/g, '\\n').trimStart());
        let actionType = '';
        const actionInputs = {};

        if (actionInstance) {
            actionType = actionInstance.function;
            for (const [paramName, param] of Object.entries(actionInstance.args)) {
                if (!param) continue;
                const trimmedParam = param.trim();
                const isStartBox = paramName.includes('start_box');
                if (!isStartBox && !paramName.includes('end_box')) {
                    actionInputs[paramName.trim()] = trimmedParam;
                    continue;
                }

                // Strip brackets, split on commas, scale each number by its axis divisor.
                const floatNumbers = trimmedParam
                    .replace(/[()[\]]/g, '')
                    .split(',')
                    .filter((piece) => piece !== '')
                    .map((num, idx) => Number.parseFloat(num) / divisors[idx % 2]);
                // A bare point is widened to a degenerate box [x, y, x, y].
                if (floatNumbers.length === 2) floatNumbers.push(floatNumbers[0], floatNumbers[1]);
                actionInputs[paramName.trim()] = JSON.stringify(floatNumbers);

                if (hasScreenSize) {
                    const boxKey = isStartBox ? 'start_coords' : 'end_coords';
                    const [x1, y1, x2 = x1, y2 = y1] = floatNumbers;
                    const [widthFactor, heightFactor] = factors;
                    // Box centre in screen units, rounded to 1/factor precision, then scaled.
                    actionInputs[boxKey] = [
                        x1,
                        y1,
                        x2,
                        y2
                    ].every(external_lodash_isnumber_default()) ? [
                        Math.round((x1 + x2) / 2 * screenContext.width * widthFactor) / widthFactor * (scaleFactor ?? 1),
                        Math.round((y1 + y2) / 2 * screenContext.height * heightFactor) / heightFactor * (scaleFactor ?? 1)
                    ] : [];
                }
            }
        }

        actions.push({
            reflection: reflection,
            thought: thought || '',
            action_type: actionType,
            action_inputs: actionInputs
        });
    }
    return actions;
}
function parseAction(actionStr) {
try {
actionStr = actionStr.replace(/<\|box_start\|>|<\|box_end\|>/g, '');
actionStr = actionStr.replace(/(?