@gui-agent/action-parser
Version:
Action parser SDK for general action parser
227 lines (226 loc) • 10.7 kB
JavaScript
/**
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
import { IMAGE_FACTOR, MAX_PIXELS_V1_5, MAX_RATIO, MIN_PIXELS, UITarsModelVersion } from "@ui-tars/shared/types";
import lodash_isnumber from "lodash.isnumber";
/**
 * Snaps `num` to the nearest multiple of `factor` (using `Math.round`).
 *
 * @param {number} num Value to snap.
 * @param {number} factor Step size of the grid.
 * @returns {number} The multiple of `factor` nearest to `num`.
 */
function roundByFactor(num, factor) {
  const nearestMultiple = Math.round(num / factor);
  return nearestMultiple * factor;
}
/**
 * Snaps `num` down to a multiple of `factor` (using `Math.floor`).
 *
 * @param {number} num Value to snap.
 * @param {number} factor Step size of the grid.
 * @returns {number} The largest multiple of `factor` that is <= `num`.
 */
function floorByFactor(num, factor) {
  const wholeMultiples = Math.floor(num / factor);
  return wholeMultiples * factor;
}
/**
 * Snaps `num` up to a multiple of `factor` (using `Math.ceil`).
 *
 * @param {number} num Value to snap.
 * @param {number} factor Step size of the grid.
 * @returns {number} The smallest multiple of `factor` that is >= `num`.
 */
function ceilByFactor(num, factor) {
  const multiplesNeeded = Math.ceil(num / factor);
  return multiplesNeeded * factor;
}
/**
 * Computes the resized image dimensions for a screenshot, snapped to a
 * multiple of `factor` and kept within the `[minPixels, maxPixels]` area range.
 *
 * Both sides are first rounded to the nearest multiple of `factor` (never
 * below `factor` itself). If the resulting area is above `maxPixels`, the
 * original sides are scaled down and floored to the grid; if it is below
 * `minPixels`, they are scaled up and ceiled to the grid.
 *
 * @param {number} height Original height.
 * @param {number} width Original width.
 * @param {number} [maxRatio=MAX_RATIO] Largest accepted long-side / short-side ratio.
 * @param {number} [factor=IMAGE_FACTOR] Grid step both returned sides are multiples of.
 * @param {number} [minPixels=MIN_PIXELS] Lower bound on the snapped area.
 * @param {number} [maxPixels=MAX_PIXELS_V1_5] Upper bound on the snapped area.
 * @returns {[number, number] | null} `[width, height]` after resizing (width
 *   first), or `null` after logging an error when the aspect ratio exceeds
 *   `maxRatio`.
 */
function smartResizeForV15(height, width, maxRatio = MAX_RATIO, factor = IMAGE_FACTOR, minPixels = MIN_PIXELS, maxPixels = MAX_PIXELS_V1_5) {
  const aspectRatio = Math.max(height, width) / Math.min(height, width);
  if (aspectRatio > maxRatio) {
    console.error(`absolute aspect ratio must be smaller than ${maxRatio}, got ${aspectRatio}`);
    return null;
  }

  const snappedWidth = Math.max(factor, roundByFactor(width, factor));
  const snappedHeight = Math.max(factor, roundByFactor(height, factor));
  const snappedArea = snappedHeight * snappedWidth;

  if (snappedArea > maxPixels) {
    // Too large: shrink both original sides by the same ratio, then round down.
    const shrink = Math.sqrt(height * width / maxPixels);
    return [
      floorByFactor(width / shrink, factor),
      floorByFactor(height / shrink, factor),
    ];
  }

  if (snappedArea < minPixels) {
    // Too small: grow both original sides by the same ratio, then round up.
    const grow = Math.sqrt(minPixels / (height * width));
    return [
      ceilByFactor(width * grow, factor),
      ceilByFactor(height * grow, factor),
    ];
  }

  return [snappedWidth, snappedHeight];
}
/**
 * Public entry point: parses a model prediction into structured actions.
 *
 * @param {object} params
 * @param {string} params.prediction Raw model output to parse.
 * @param {number | number[]} [params.factor] Coordinate normalisation factor.
 *   A single number is used for both axes; an array is passed through as is.
 *   When omitted (`null`/`undefined`), `undefined` is forwarded so that
 *   `parseActionVlm` applies its own default instead of dividing coordinates
 *   by `undefined` (which would produce NaN).
 * @param {string} [params.mode] Forwarded to `parseActionVlm`.
 * @param {object} [params.screenContext] Forwarded to `parseActionVlm`.
 * @param {number} [params.scaleFactor] Forwarded to `parseActionVlm`.
 * @param {*} [params.modelVer] Forwarded to `parseActionVlm`.
 * @returns {{ parsed: object[] }} The parsed actions wrapped in an object.
 */
function actionParser(params) {
  const { prediction, factor, mode, screenContext, scaleFactor, modelVer } = params;

  let factors;
  if (Array.isArray(factor)) {
    factors = factor;
  } else if (factor != null) {
    factors = [factor, factor];
  }
  // Otherwise `factors` stays undefined, which triggers the callee's default.

  const parsed = parseActionVlm(prediction, factors, mode, screenContext, scaleFactor, modelVer);
  return { parsed };
}
/**
 * Extracts the raw action strings from a model prediction without parsing
 * them into structured actions.
 *
 * The trimmed prediction is checked against three formats, in this order; the
 * first one that yields a non-empty action string wins:
 *
 *  1. A `<think>…</think>` block together with a `<computer_env>…</computer_env>`
 *     block: the trimmed `<computer_env>` content with a leading `Action:`
 *     prefix removed, split on blank lines (parts are not trimmed).
 *  2. An `Action:` marker (ASCII colon or full-width colon U+FF1A): everything
 *     after the last marker, split on blank lines, each part trimmed.
 *  3. `\nAction: … </Output>` on one line: the captured text, split on blank
 *     lines (parts are not trimmed).
 *
 * @param {string} prediction Raw model output.
 * @returns {string[]} The action strings, or an empty array when none is found.
 */
function actionStringParser(prediction) {
  const text = prediction.trim();

  // Format 1: only considered when BOTH blocks are present.
  const thinkMatch = text.match(/<think[^>]*>([\s\S]*?)<\/think[^>]*>/i);
  const computerEnvMatch = text.match(/<computer_env>([\s\S]*?)<\/computer_env>/i);
  if (thinkMatch && computerEnvMatch) {
    const envAction = computerEnvMatch[1].trim().replace(/^Action:\s*/i, '');
    if (envAction !== '') return envAction.split('\n\n');
  }

  // Format 2: the character class must list the full-width colon explicitly,
  // otherwise "Action\uFF1A" is never split and the whole text is returned.
  const actionParts = text.split(/Action[:\uFF1A]/);
  if (actionParts.length > 1) {
    const lastPart = actionParts[actionParts.length - 1];
    if (lastPart !== '') return lastPart.split('\n\n').map((str) => str.trim());
  }

  // Format 3: reached only when the text after the last marker is empty.
  const outputMatch = text.match(/\nAction:\s*(.*?)\s*<\/Output>/);
  const outputAction = outputMatch ? outputMatch[1] : '';
  return outputAction !== '' ? outputAction.split('\n\n') : [];
}
/**
 * Parses a raw model prediction into a list of structured actions.
 *
 * Steps:
 *  1. Extract the thought / reflection and the action string according to
 *     `mode` (`'bc'` or `'o1'`). A `<think>` + `<computer_env>` pair, when both
 *     are present, overrides the thought and action string from either mode.
 *  2. Split the action string on blank lines and run each piece through
 *     `parseAction`.
 *  3. For every `start_box` / `end_box` argument, divide the coordinates by the
 *     per-axis factor and store them as a JSON string. When `screenContext`
 *     has a width and height, also store the box centre in screen units under
 *     `start_coords` / `end_coords`.
 *
 * The `Action` marker is recognised with either an ASCII colon or a full-width
 * colon (U+FF1A), both when splitting and when delimiting the thought text.
 *
 * @param {string} text Raw model output.
 * @param {number[]} [factors=[1000, 1000]] `[xFactor, yFactor]` divisors for
 *   box coordinates.
 * @param {string} [mode='bc'] Output format: `'bc'` or `'o1'`. Any other value
 *   extracts nothing in step 1.
 * @param {{ width?: number, height?: number }} [screenContext] Screen size.
 * @param {number} [scaleFactor] Multiplier applied to the screen coordinates;
 *   treated as 1 when `null`/`undefined`.
 * @param {*} [modelVer=UITarsModelVersion.V1_0] Model version. For `V1_5` with
 *   a usable `screenContext`, box coordinates are divided by the dimensions
 *   returned by `smartResizeForV15` instead of `factors`.
 * @returns {Array<{ reflection: (string|null), thought: string,
 *   action_type: string, action_inputs: object }>} One entry per action piece.
 *   Pieces for which `parseAction` returns a falsy value yield an entry with
 *   an empty `action_type` and empty `action_inputs`.
 */
function parseActionVlm(text, factors = [1000, 1000], mode = 'bc', screenContext, scaleFactor, modelVer = UITarsModelVersion.V1_0) {
  let reflection = null;
  let thought = null;
  let actionStr = '';

  // Only V1_5 with a known screen size uses the smart-resize dimensions;
  // smartResizeForV15 may itself return null, in which case `factors` is used.
  let smartResizeFactors = null;
  if (modelVer === UITarsModelVersion.V1_5 && screenContext?.height && screenContext?.width) {
    smartResizeFactors = smartResizeForV15(screenContext.height, screenContext.width);
  }

  const source = text.trim();

  if (mode === 'bc') {
    if (source.includes('Thought:')) {
      const thoughtMatch = source.match(/Thought: ([\s\S]+?)(?=\s*Action[:\uFF1A]|$)/);
      if (thoughtMatch) thought = thoughtMatch[1].trim();
    } else if (source.startsWith('Reflection:')) {
      const reflectionMatch = source.match(/Reflection: ([\s\S]+?)Action_Summary: ([\s\S]+?)(?=\s*Action[:\uFF1A]|$)/);
      if (reflectionMatch) {
        thought = reflectionMatch[2].trim();
        reflection = reflectionMatch[1].trim();
      }
    } else if (source.startsWith('Action_Summary:')) {
      const summaryMatch = source.match(/Action_Summary: (.+?)(?=\s*Action[:\uFF1A]|$)/);
      if (summaryMatch) thought = summaryMatch[1].trim();
    }
    // Everything after the last marker is the action string. Without a marker
    // split() returns the whole text as its only element, so the whole text is
    // treated as the action string.
    const actionParts = source.split(/Action[:\uFF1A]/);
    actionStr = actionParts[actionParts.length - 1];
  } else if (mode === 'o1') {
    const thoughtMatch = source.match(/<Thought>\s*(.*?)\s*<\/Thought>/);
    const actionSummaryMatch = source.match(/\nAction_Summary:\s*(.*?)\s*Action:/);
    const actionMatch = source.match(/\nAction:\s*(.*?)\s*<\/Output>/);
    const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
    const actionSummaryContent = actionSummaryMatch ? actionSummaryMatch[1] : null;
    const actionContent = actionMatch ? actionMatch[1] : null;
    // Missing parts are interpolated as the literal text "null".
    thought = `${thoughtContent}\n<Action_Summary>\n${actionSummaryContent}`;
    actionStr = actionContent || '';
  }

  // A <think> + <computer_env> pair takes precedence over the mode-specific result.
  const thinkMatch = source.match(/<think[^>]*>([\s\S]*?)<\/think[^>]*>/i);
  const computerEnvMatch = source.match(/<computer_env>([\s\S]*?)<\/computer_env>/i);
  if (thinkMatch && computerEnvMatch) {
    thought = thinkMatch[1].trim();
    actionStr = computerEnvMatch[1].trim().replace(/^Action:\s*/i, '');
  }

  // smartResizeFactors is non-null only for V1_5, so this picks the same
  // divisors for every box coordinate of every action.
  const divisors = smartResizeFactors ?? factors;
  const hasScreenSize = Boolean(screenContext?.width && screenContext?.height);

  const actions = [];
  for (const rawStr of actionStr.split('\n\n')) {
    // Real newlines inside one action become the two characters "\n" before parsing.
    const actionInstance = parseAction(rawStr.replace(/\n/g, String.raw`\n`).trimStart());
    let actionType = '';
    const actionInputs = {};

    if (actionInstance) {
      actionType = actionInstance.function;
      for (const [paramName, param] of Object.entries(actionInstance.args)) {
        if (!param) continue;
        const trimmedParam = param.trim();
        const inputKey = paramName.trim();
        const isBox = paramName.includes('start_box') || paramName.includes('end_box');

        if (!isBox) {
          actionInputs[inputKey] = trimmedParam;
          continue;
        }

        // Strip brackets/parentheses, then divide each number by its axis
        // factor: even positions use index 0 (x), odd positions index 1 (y).
        const numbers = trimmedParam.replace(/[()[\]]/g, '').split(',').filter((ori) => ori !== '');
        const floatNumbers = numbers.map((num, idx) => Number.parseFloat(num) / divisors[idx % 2]);
        // A single point is expanded to a zero-size box.
        if (floatNumbers.length === 2) floatNumbers.push(floatNumbers[0], floatNumbers[1]);
        actionInputs[inputKey] = JSON.stringify(floatNumbers);

        if (hasScreenSize) {
          const boxKey = paramName.includes('start_box') ? 'start_coords' : 'end_coords';
          const [x1, y1, x2 = x1, y2 = y1] = floatNumbers;
          const [widthFactor, heightFactor] = factors;
          const scale = scaleFactor ?? 1;
          // Box centre in screen units, rounded to 1/factor precision, then scaled.
          actionInputs[boxKey] = [x1, y1, x2, y2].every(lodash_isnumber)
            ? [
              Math.round((x1 + x2) / 2 * screenContext.width * widthFactor) / widthFactor * scale,
              Math.round((y1 + y2) / 2 * screenContext.height * heightFactor) / heightFactor * scale,
            ]
            : [];
        }
      }
    }

    actions.push({
      reflection,
      thought: thought || '',
      action_type: actionType,
      action_inputs: actionInputs,
    });
  }
  return actions;
}
function parseAction(actionStr) {
try {
actionStr = actionStr.replace(/<\|box_start\|>|<\|box_end\|>/g, '');
actionStr = actionStr.replace(/(?