/*
 * @ui-tars/action-parser
 * Version: (not specified)
 * Action parser SDK for UI-TARS
 * 120 lines (119 loc) • 3.92 kB
 * JavaScript
 */
/**
 * Entry point of the parser: runs `parseActionVlm` on a model prediction.
 *
 * @param {{prediction: string, factor?: number}} params - `prediction` is the
 *   raw model output; `factor` is forwarded unchanged as the coordinate
 *   divisor (when omitted, `parseActionVlm` applies its own default).
 * @returns {{parsed: Array<object>}} Object whose `parsed` property holds
 *   whatever `parseActionVlm` returned.
 */
function actionParser(params) {
  const parsed = parseActionVlm(params.prediction, params.factor);
  return { parsed };
}
/**
 * Convert a box/point string such as "(100,200)" or "[1,2,3,4]" into a JSON
 * array string of coordinates divided by `factor`.
 *
 * @param {string} rawBox - Comma-separated numbers, optionally wrapped in
 *   parentheses or square brackets.
 * @param {number} factor - Divisor applied to every coordinate.
 * @returns {string} `JSON.stringify` of the scaled numbers; a two-number
 *   point is widened to four numbers by repeating it.
 */
function scaleBoxCoordinates(rawBox, factor) {
  const coords = rawBox
    .replace(/[()[\]]/g, "")
    .split(",")
    .map((num) => Number.parseFloat(num) / factor);
  // A bare point (x, y) is expressed as the degenerate box (x, y, x, y).
  if (coords.length === 2) {
    coords.push(coords[0], coords[1]);
  }
  return JSON.stringify(coords);
}

/**
 * Parse a raw model prediction into a list of structured actions.
 *
 * In "bc" mode the text may start with `Thought:`, `Reflection:` (followed by
 * `Action_Summary:`) or `Action_Summary:`; everything after the last
 * `Action:` marker is treated as the action section (the whole text when no
 * marker is present). In "o1" mode the thought, summary and action are read
 * from `<Thought>…</Thought>`, `Action_Summary:` and `Action: … </Output>`.
 * Any other mode yields a single empty action.
 *
 * The action section is split on blank lines; each piece is handed to
 * `parseAction`. Pieces that fail to parse still produce an entry, with an
 * empty `action_type` and empty `action_inputs`.
 *
 * @param {string} text - Raw model output.
 * @param {number} [factor=1e3] - Divisor applied to `start_box` / `end_box`
 *   coordinates.
 * @param {string} [mode="bc"] - Output format: "bc" or "o1".
 * @returns {Array<{reflection: (string|null), thought: string,
 *   action_type: string, action_inputs: Object<string, string>}>}
 */
function parseActionVlm(text, factor = 1e3, mode = "bc") {
  const input = text.trim();
  let reflection = null;
  let thought = null;
  let actionStr = "";

  if (mode === "bc") {
    if (input.startsWith("Thought:")) {
      const thoughtMatch = input.match(/Thought: ([\s\S]+?)(?=\s*Action:|$)/);
      if (thoughtMatch) {
        thought = thoughtMatch[1].trim();
      }
    } else if (input.startsWith("Reflection:")) {
      const reflectionMatch = input.match(
        /Reflection: ([\s\S]+?)Action_Summary: ([\s\S]+?)(?=\s*Action:|$)/
      );
      if (reflectionMatch) {
        reflection = reflectionMatch[1].trim();
        thought = reflectionMatch[2].trim();
      }
    } else if (input.startsWith("Action_Summary:")) {
      // [\s\S] (not `.`) so a summary spanning several lines is captured,
      // matching the Thought/Reflection branches above.
      const summaryMatch = input.match(
        /Action_Summary: ([\s\S]+?)(?=\s*Action:|$)/
      );
      if (summaryMatch) {
        thought = summaryMatch[1].trim();
      }
    }
    actionStr = input.includes("Action:")
      ? input.split("Action:").pop()
      : input;
  } else if (mode === "o1") {
    // [\s\S] lets each section span multiple lines.
    const thoughtMatch = input.match(/<Thought>\s*([\s\S]*?)\s*<\/Thought>/);
    const summaryMatch = input.match(
      /\nAction_Summary:\s*([\s\S]*?)\s*Action:/
    );
    const actionMatch = input.match(/\nAction:\s*([\s\S]*?)\s*<\/Output>/);
    // Missing sections render as empty strings rather than the text "null".
    const thoughtContent = thoughtMatch ? thoughtMatch[1] : "";
    const summaryContent = summaryMatch ? summaryMatch[1] : "";
    thought = `${thoughtContent}\n<Action_Summary>\n${summaryContent}`;
    actionStr = actionMatch ? actionMatch[1] : "";
  }

  const actions = [];
  // Trim first and split on runs of 2+ newlines so that "Action:\n…" and
  // extra blank lines between actions do not leave a stray leading newline.
  for (const rawStr of actionStr.trim().split(/\n{2,}/)) {
    // Remaining single newlines are turned into the two characters `\n`
    // because parseAction's pattern does not match across line breaks.
    const actionInstance = parseAction(
      rawStr.trim().replace(/\n/g, String.raw`\n`)
    );
    let actionType = "";
    const actionInputs = {};
    if (actionInstance) {
      actionType = actionInstance.function;
      for (const [paramName, param] of Object.entries(actionInstance.args)) {
        if (!param) continue; // empty values are dropped
        const key = paramName.trim();
        const value = param.trim();
        const isBox =
          paramName.includes("start_box") || paramName.includes("end_box");
        actionInputs[key] = isBox ? scaleBoxCoordinates(value, factor) : value;
      }
    }
    actions.push({
      reflection,
      thought: thought || "",
      action_type: actionType,
      action_inputs: actionInputs
    });
  }
  return actions;
}
/**
 * Parse a single call-like action string, e.g. `click(start_box='(1,2)')`,
 * into its function name and keyword arguments.
 *
 * Arguments are split on commas that are outside quoted strings. Both
 * single- and double-quoted values are recognised, and a backslash-escaped
 * quote inside a quoted value does not terminate it. One surrounding quote
 * character is stripped from each end of a value; escape sequences inside the
 * value are left untouched.
 *
 * @param {string} actionStr - Action text; must be a single line of the form
 *   `name(args)` after trimming.
 * @returns {{function: string, args: Object<string, string>}|null} Parsed
 *   call, or `null` (after logging via `console.error`) when the text is not
 *   a function call or cannot be processed.
 */
function parseAction(actionStr) {
  try {
    const functionPattern = /^(\w+)\((.*)\)$/;
    const match = actionStr.trim().match(functionPattern);
    if (!match) {
      throw new Error("Not a function call");
    }
    const [, functionName, argsStr] = match;
    const kwargs = {};
    if (argsStr.trim()) {
      // Alternatives, tried in order at each position:
      //   1. '...' honouring backslash escapes   2. plain '...'
      //   3. "..." honouring backslash escapes   4. plain "..."
      //   5. any character other than a comma or single quote
      // The plain forms keep values such as 'C:\' (trailing backslash)
      // intact when the escape-aware form cannot find a closing quote.
      const argPairs =
        argsStr.match(
          /(?:'(?:[^'\\]|\\.)*'|'[^']*'|"(?:[^"\\]|\\.)*"|"[^"]*"|[^,'])+/g
        ) || [];
      for (const pair of argPairs) {
        // Only the first "=" separates key from value; later ones belong
        // to the value.
        const [key, ...valueParts] = pair.split("=");
        if (!key) continue;
        const value = valueParts.join("=").trim().replace(/^['"]|['"]$/g, "");
        kwargs[key.trim()] = value;
      }
    }
    return {
      function: functionName,
      args: kwargs
    };
  } catch (e) {
    console.error(`Failed to parse action '${actionStr}': ${e}`);
    return null;
  }
}
// Public API of this module; parseAction is not exported.
export {
actionParser,
parseActionVlm
};