UNPKG

@gui-agent/action-parser

Version:

Action parser SDK for general action parser

227 lines (226 loc) 10.7 kB
/**
 * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
 * SPDX-License-Identifier: Apache-2.0
 */
import { IMAGE_FACTOR, MAX_PIXELS_V1_5, MAX_RATIO, MIN_PIXELS, UITarsModelVersion } from "@ui-tars/shared/types";
import lodash_isnumber from "lodash.isnumber";

// Markers that introduce the action section. Both the ASCII colon and the
// full-width colon (U+FF1A) are accepted; the regexes below must agree with
// this list, otherwise a prediction that passes the keyword check would not
// actually be split.
const ACTION_KEYWORDS = ['Action:', 'Action\uFF1A'];
const ACTION_MARKER_RE = /Action[:\uFF1A]/;
const ACTION_PREFIX_RE = /^Action:\s*/i;

// "bc" mode section extractors. Each one stops lazily at the next action
// marker (ASCII or full-width colon) or at the end of the text.
const THOUGHT_RE = /Thought: ([\s\S]+?)(?=\s*Action[:\uFF1A]|$)/;
const REFLECTION_RE = /Reflection: ([\s\S]+?)Action_Summary: ([\s\S]+?)(?=\s*Action[:\uFF1A]|$)/;
const ACTION_SUMMARY_RE = /Action_Summary: (.+?)(?=\s*Action[:\uFF1A]|$)/;

// "o1" mode section extractors.
const O1_THOUGHT_RE = /<Thought>\s*(.*?)\s*<\/Thought>/;
const O1_ACTION_SUMMARY_RE = /\nAction_Summary:\s*(.*?)\s*Action:/;
const O1_ACTION_RE = /\nAction:\s*(.*?)\s*<\/Output>/;

// <think>…</think> / <computer_env>…</computer_env> wrapped output.
const THINK_RE = /<think[^>]*>([\s\S]*?)<\/think[^>]*>/i;
const COMPUTER_ENV_RE = /<computer_env>([\s\S]*?)<\/computer_env>/i;

/** Rounds `num` to the nearest multiple of `factor`. */
function roundByFactor(num, factor) {
  return Math.round(num / factor) * factor;
}

/** Rounds `num` down to a multiple of `factor`. */
function floorByFactor(num, factor) {
  return Math.floor(num / factor) * factor;
}

/** Rounds `num` up to a multiple of `factor`. */
function ceilByFactor(num, factor) {
  return Math.ceil(num / factor) * factor;
}

/**
 * Computes a resized `[width, height]` pair whose sides are multiples of
 * `factor` and whose area is pushed into the `[minPixels, maxPixels]` range.
 *
 * @param {number} height - Source height.
 * @param {number} width - Source width.
 * @param {number} [maxRatio] - Largest allowed long-side / short-side ratio.
 * @param {number} [factor] - Both output sides are multiples of this value.
 * @param {number} [minPixels] - Lower bound for the output area.
 * @param {number} [maxPixels] - Upper bound for the output area.
 * @returns {number[] | null} `[wBar, hBar]`, or `null` (after logging an
 *   error) when the aspect ratio exceeds `maxRatio`.
 */
function smartResizeForV15(height, width, maxRatio = MAX_RATIO, factor = IMAGE_FACTOR, minPixels = MIN_PIXELS, maxPixels = MAX_PIXELS_V1_5) {
  const aspectRatio = Math.max(height, width) / Math.min(height, width);
  if (aspectRatio > maxRatio) {
    console.error(`absolute aspect ratio must be smaller than ${maxRatio}, got ${aspectRatio}`);
    return null;
  }

  let wBar = Math.max(factor, roundByFactor(width, factor));
  let hBar = Math.max(factor, roundByFactor(height, factor));

  if (hBar * wBar > maxPixels) {
    // Too large: shrink both sides by the same ratio, rounding down.
    const beta = Math.sqrt(height * width / maxPixels);
    hBar = floorByFactor(height / beta, factor);
    wBar = floorByFactor(width / beta, factor);
  } else if (hBar * wBar < minPixels) {
    // Too small: grow both sides by the same ratio, rounding up.
    const beta = Math.sqrt(minPixels / (height * width));
    hBar = ceilByFactor(height * beta, factor);
    wBar = ceilByFactor(width * beta, factor);
  }

  return [wBar, hBar];
}

/**
 * Parses a model prediction into structured actions.
 *
 * @param {object} params
 * @param {string} params.prediction - Raw model output.
 * @param {number | number[]} [params.factor] - Coordinate divisor; a single
 *   number is used for both axes. When omitted, `parseActionVlm`'s default
 *   factors apply.
 * @param {string} [params.mode] - Forwarded to `parseActionVlm`.
 * @param {object} [params.screenContext] - Forwarded to `parseActionVlm`.
 * @param {number} [params.scaleFactor] - Forwarded to `parseActionVlm`.
 * @param {*} [params.modelVer] - Forwarded to `parseActionVlm`.
 * @returns {{ parsed: object[] }} The parsed actions under `parsed`.
 */
function actionParser(params) {
  const { prediction, factor, mode, screenContext, scaleFactor, modelVer } = params;

  // Leave a missing factor as `undefined` so the callee's default kicks in;
  // wrapping it as [undefined, undefined] would turn every coordinate into NaN.
  let factors;
  if (factor != null) {
    factors = Array.isArray(factor) ? factor : [factor, factor];
  }

  const parsed = parseActionVlm(prediction, factors, mode, screenContext, scaleFactor, modelVer);
  return { parsed };
}

/**
 * Extracts the raw action strings from a prediction without parsing them.
 *
 * Formats are tried in order: `<think>` + `<computer_env>` wrapped output,
 * then text containing an `Action:` marker, then the `\nAction: … </Output>`
 * form. Multiple actions are separated by a blank line.
 *
 * @param {string} prediction - Raw model output.
 * @returns {string[]} Action strings; empty when no action section is found.
 */
function actionStringParser(prediction) {
  const text = prediction.trim();

  const thinkMatch = text.match(THINK_RE);
  const computerEnvMatch = text.match(COMPUTER_ENV_RE);
  if (thinkMatch && computerEnvMatch) {
    const envAction = computerEnvMatch[1].trim().replace(ACTION_PREFIX_RE, '');
    if (envAction !== '') {
      return envAction.split('\n\n');
    }
  }

  if (ACTION_KEYWORDS.some((keyword) => text.includes(keyword))) {
    // Everything after the last marker is the action section.
    const actionParts = text.split(ACTION_MARKER_RE);
    const trailing = actionParts[actionParts.length - 1];
    if (trailing !== '') {
      return trailing.split('\n\n').map((str) => str.trim());
    }
  }

  const outputMatch = text.match(O1_ACTION_RE);
  const outputAction = outputMatch ? outputMatch[1] : '';
  if (outputAction !== '') {
    return outputAction.split('\n\n');
  }

  return [];
}

/**
 * Converts a textual box such as `(x1,y1,x2,y2)` or `[x,y]` into numbers
 * divided by the per-axis divisors (even positions use `divisors[0]`, odd
 * positions use `divisors[1]`). A two-value point is duplicated into a
 * four-value box.
 *
 * @param {string} rawBox - Box text, possibly wrapped in () or [].
 * @param {number[]} divisors - `[xDivisor, yDivisor]`.
 * @returns {number[]} The normalised values (NaN for unparsable entries).
 */
function normalizeBox(rawBox, divisors) {
  const parts = rawBox.replace(/[()[\]]/g, '').split(',').filter((part) => part !== '');
  const values = parts.map((num, idx) => Number.parseFloat(num) / divisors[idx % 2]);
  if (values.length === 2) {
    values.push(values[0], values[1]);
  }
  return values;
}

/**
 * Projects the centre of a normalised box onto the screen.
 *
 * @param {number[]} box - Normalised `[x1, y1, x2, y2]`; x2/y2 default to x1/y1.
 * @param {{ width: number, height: number }} screenContext - Screen size.
 * @param {number[]} factors - `[widthFactor, heightFactor]` used as rounding
 *   precision.
 * @param {number} [scaleFactor] - Final multiplier; nullish means 1.
 * @returns {number[]} `[x, y]`, or `[]` when any box value is not a number.
 */
function toScreenCoords(box, screenContext, factors, scaleFactor) {
  const [x1, y1, x2 = x1, y2 = y1] = box;
  // NOTE(review): lodash isNumber also accepts NaN, so unparsable box values
  // flow through as NaN coordinates rather than yielding [] — confirm intent.
  if (![x1, y1, x2, y2].every(lodash_isnumber)) {
    return [];
  }
  const [widthFactor, heightFactor] = factors;
  const scale = scaleFactor ?? 1;
  return [
    Math.round((x1 + x2) / 2 * screenContext.width * widthFactor) / widthFactor * scale,
    Math.round((y1 + y2) / 2 * screenContext.height * heightFactor) / heightFactor * scale,
  ];
}

/**
 * Parses a model prediction into a list of structured actions.
 *
 * @param {string} text - Raw model output.
 * @param {number[]} [factors] - `[xFactor, yFactor]` coordinate divisors.
 * @param {string} [mode] - `'bc'` or `'o1'`; selects the text layout to parse.
 * @param {{ width?: number, height?: number }} [screenContext] - When both
 *   sides are set, `start_coords` / `end_coords` are added for box parameters.
 * @param {number} [scaleFactor] - Multiplier applied to screen coordinates.
 * @param {*} [modelVer] - For `UITarsModelVersion.V1_5` with a known screen
 *   size, boxes are divided by the smart-resize dimensions instead of `factors`.
 * @returns {Array<{reflection: (string|null), thought: string, action_type: string, action_inputs: object}>}
 *   One entry per blank-line-separated action; an action that fails to parse
 *   yields an entry with an empty `action_type` and empty `action_inputs`.
 */
function parseActionVlm(text, factors = [1000, 1000], mode = 'bc', screenContext, scaleFactor, modelVer = UITarsModelVersion.V1_0) {
  let reflection = null;
  let thought = null;
  let actionStr = '';

  const hasScreenSize = Boolean(screenContext?.width && screenContext?.height);
  const isV15 = modelVer === UITarsModelVersion.V1_5;
  const smartResizeFactors = isV15 && hasScreenSize
    ? smartResizeForV15(screenContext.height, screenContext.width)
    : null;

  const source = text.trim();

  if (mode === 'bc') {
    if (source.includes('Thought:')) {
      const thoughtMatch = source.match(THOUGHT_RE);
      if (thoughtMatch) {
        thought = thoughtMatch[1].trim();
      }
    } else if (source.startsWith('Reflection:')) {
      const reflectionMatch = source.match(REFLECTION_RE);
      if (reflectionMatch) {
        thought = reflectionMatch[2].trim();
        reflection = reflectionMatch[1].trim();
      }
    } else if (source.startsWith('Action_Summary:')) {
      const summaryMatch = source.match(ACTION_SUMMARY_RE);
      if (summaryMatch) {
        thought = summaryMatch[1].trim();
      }
    }

    if (ACTION_KEYWORDS.some((keyword) => source.includes(keyword))) {
      const actionParts = source.split(ACTION_MARKER_RE);
      actionStr = actionParts[actionParts.length - 1];
    } else {
      // No marker at all: treat the whole text as the action section.
      actionStr = source;
    }
  } else if (mode === 'o1') {
    const thoughtMatch = source.match(O1_THOUGHT_RE);
    const actionSummaryMatch = source.match(O1_ACTION_SUMMARY_RE);
    const actionMatch = source.match(O1_ACTION_RE);
    const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
    const actionSummaryContent = actionSummaryMatch ? actionSummaryMatch[1] : null;
    // A missing section is rendered as the literal text "null" by the template.
    thought = `${thoughtContent}\n<Action_Summary>\n${actionSummaryContent}`;
    actionStr = (actionMatch ? actionMatch[1] : null) || '';
  }

  // <think>/<computer_env> wrapped output overrides whatever the mode found.
  const thinkMatch = source.match(THINK_RE);
  const computerEnvMatch = source.match(COMPUTER_ENV_RE);
  if (thinkMatch && computerEnvMatch) {
    thought = thinkMatch[1].trim();
    actionStr = computerEnvMatch[1].trim().replace(ACTION_PREFIX_RE, '');
  }

  const boxDivisors = isV15 && smartResizeFactors ? smartResizeFactors : factors;
  const actions = [];

  for (const rawStr of actionStr.split('\n\n')) {
    // Newlines inside one action become a literal backslash-n so the
    // single-line call pattern in parseAction can still match.
    const actionInstance = parseAction(rawStr.replace(/\n/g, '\\n').trimStart());
    let actionType = '';
    const actionInputs = {};

    if (actionInstance) {
      actionType = actionInstance.function;
      for (const [paramName, param] of Object.entries(actionInstance.args)) {
        if (!param) {
          continue;
        }
        const trimmedParam = param.trim();
        const isStartBox = paramName.includes('start_box');
        if (!isStartBox && !paramName.includes('end_box')) {
          actionInputs[paramName.trim()] = trimmedParam;
          continue;
        }

        const box = normalizeBox(trimmedParam, boxDivisors);
        actionInputs[paramName.trim()] = JSON.stringify(box);
        if (hasScreenSize) {
          const boxKey = isStartBox ? 'start_coords' : 'end_coords';
          actionInputs[boxKey] = toScreenCoords(box, screenContext, factors, scaleFactor);
        }
      }
    }

    actions.push({
      reflection,
      thought: thought || '',
      action_type: actionType,
      action_inputs: actionInputs,
    });
  }

  return actions;
}

/**
 * Parses a single `name(key='value', ...)` call string.
 *
 * @param {string} actionStr - One action in function-call form.
 * @returns {{ function: string, args: Object<string, string> } | null} The
 *   function name and its keyword arguments, or `null` (after logging an
 *   error) when the text is not a function call.
 */
function parseAction(actionStr) {
  let normalized = actionStr;
  try {
    normalized = normalized.replace(/<\|box_start\|>|<\|box_end\|>/g, '');
    // Map the point-style parameter names onto the box-style ones; a bare
    // `point=` (not preceded by start_/end_) becomes `start_box=`.
    normalized = normalized
      .replace(/(?<!start_|end_)point=/g, 'start_box=')
      .replace(/start_point=/g, 'start_box=')
      .replace(/end_point=/g, 'end_box=');

    const match = normalized.trim().match(/^(\w+)\((.*)\)$/);
    if (!match) {
      throw new Error('Not a function call');
    }
    const [, functionName, argsStr] = match;

    const kwargs = {};
    if (argsStr.trim()) {
      // Split on commas, keeping single-quoted segments intact.
      const argPairs = argsStr.match(/([^,']|'[^']*')+/g) || [];
      for (const pair of argPairs) {
        const [key, ...valueParts] = pair.split('=');
        if (!key) {
          continue;
        }
        // Re-join on '=' so values containing '=' survive; strip one
        // leading and one trailing quote.
        let value = valueParts.join('=').trim().replace(/^['"]|['"]$/g, '');
        if (value.includes('<bbox>')) {
          value = value.replace(/<bbox>|<\/bbox>/g, '').replace(/\s+/g, ',');
          value = `(${value})`;
        }
        if (value.includes('<point>')) {
          value = value.replace(/<point>|<\/point>/g, '').replace(/\s+/g, ',');
          value = `(${value})`;
        }
        kwargs[key.trim()] = value;
      }
    }

    return { function: functionName, args: kwargs };
  } catch (e) {
    console.error(`Failed to parse action '${normalized}': ${e}`);
    return null;
  }
}

export { actionParser, actionStringParser, parseActionVlm };
//# sourceMappingURL=actionParser.mjs.map