UNPKG

@ui-tars/action-parser

Version:
1 lines 16.9 kB
{"version":3,"file":"actionParser.mjs","sources":["webpack://@ui-tars/action-parser/./src/actionParser.ts"],"sourcesContent":["/*\n * Copyright (c) 2025 Bytedance, Inc. and its affiliates.\n * SPDX-License-Identifier: Apache-2.0\n */\nimport {\n ActionInputs,\n PredictionParsed,\n UITarsModelVersion,\n MAX_RATIO,\n IMAGE_FACTOR,\n MIN_PIXELS,\n MAX_PIXELS_V1_5,\n} from '@ui-tars/shared/types';\nimport isNumber from 'lodash.isnumber';\n\nfunction roundByFactor(num: number, factor: number): number {\n return Math.round(num / factor) * factor;\n}\n\nfunction floorByFactor(num: number, factor: number): number {\n return Math.floor(num / factor) * factor;\n}\n\nfunction ceilByFactor(num: number, factor: number): number {\n return Math.ceil(num / factor) * factor;\n}\n\nfunction smartResizeForV15(\n height: number,\n width: number,\n maxRatio: number = MAX_RATIO,\n factor: number = IMAGE_FACTOR,\n minPixels: number = MIN_PIXELS,\n maxPixels: number = MAX_PIXELS_V1_5,\n): [number, number] | null {\n if (Math.max(height, width) / Math.min(height, width) > maxRatio) {\n console.error(\n `absolute aspect ratio must be smaller than ${maxRatio}, got ${\n Math.max(height, width) / Math.min(height, width)\n }`,\n );\n return null;\n }\n\n let wBar = Math.max(factor, roundByFactor(width, factor));\n let hBar = Math.max(factor, roundByFactor(height, factor));\n\n if (hBar * wBar > maxPixels) {\n const beta = Math.sqrt((height * width) / maxPixels);\n hBar = floorByFactor(height / beta, factor);\n wBar = floorByFactor(width / beta, factor);\n } else if (hBar * wBar < minPixels) {\n const beta = Math.sqrt(minPixels / (height * width));\n hBar = ceilByFactor(height * beta, factor);\n wBar = ceilByFactor(width * beta, factor);\n }\n\n return [wBar, hBar];\n}\n\nexport function actionParser(params: {\n prediction: string;\n /** [widthFactor, heightFactor] */\n factor: number | [number, number];\n screenContext?: {\n width: number;\n height: number;\n };\n scaleFactor?: number;\n mode?: 'bc' | 'o1';\n modelVer?: UITarsModelVersion;\n}): {\n parsed: PredictionParsed[];\n} {\n const { prediction, factor, mode, screenContext, scaleFactor, modelVer } =\n params;\n\n const parsed = parseActionVlm(\n prediction,\n Array.isArray(factor) ? factor : [factor, factor],\n mode,\n screenContext,\n scaleFactor,\n modelVer,\n );\n\n return {\n parsed,\n };\n}\n\nexport function parseActionVlm(\n text: string,\n factors: [number, number] = [1000, 1000],\n mode: 'bc' | 'o1' = 'bc',\n screenContext?: {\n width: number;\n height: number;\n },\n scaleFactor?: number,\n modelVer: UITarsModelVersion = UITarsModelVersion.V1_0,\n): PredictionParsed[] {\n let reflection: string | null = null;\n let thought: string | null = null;\n let actionStr = '';\n\n let smartResizeFactors: [number, number] | null = null;\n if (\n modelVer === UITarsModelVersion.V1_5 &&\n screenContext?.height &&\n screenContext?.width\n ) {\n smartResizeFactors = smartResizeForV15(\n screenContext.height,\n screenContext.width,\n );\n }\n\n text = text.trim();\n if (mode === 'bc') {\n // Parse thought/reflection based on different text patterns\n if (text.includes('Thought:')) {\n const thoughtMatch = text.match(\n /Thought: ([\\s\\S]+?)(?=\\s*Action[::]|$)/,\n );\n\n if (thoughtMatch) {\n thought = thoughtMatch[1].trim();\n }\n } else if (text.startsWith('Reflection:')) {\n const reflectionMatch = text.match(\n /Reflection: ([\\s\\S]+?)Action_Summary: ([\\s\\S]+?)(?=\\s*Action[::]|$)/,\n );\n if (reflectionMatch) {\n thought = reflectionMatch[2].trim();\n reflection = reflectionMatch[1].trim();\n }\n } else if (text.startsWith('Action_Summary:')) {\n const summaryMatch = text.match(\n /Action_Summary: (.+?)(?=\\s*Action[::]|$)/,\n );\n if (summaryMatch) {\n thought = summaryMatch[1].trim();\n }\n }\n\n if (!['Action:', 'Action:'].some((keyword) => text.includes(keyword))) {\n // throw new Error('No Action found in text');\n actionStr = text;\n } else {\n const actionParts = text.split(/Action[::]/);\n actionStr = actionParts[actionParts.length - 1];\n }\n } else if (mode === 'o1') {\n // Parse o1 format\n const thoughtMatch = text.match(/<Thought>\\s*(.*?)\\s*<\\/Thought>/);\n const actionSummaryMatch = text.match(\n /\\nAction_Summary:\\s*(.*?)\\s*Action:/,\n );\n const actionMatch = text.match(/\\nAction:\\s*(.*?)\\s*<\\/Output>/);\n\n const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;\n const actionSummaryContent = actionSummaryMatch\n ? actionSummaryMatch[1]\n : null;\n const actionContent = actionMatch ? actionMatch[1] : null;\n\n thought = `${thoughtContent}\\n<Action_Summary>\\n${actionSummaryContent}`;\n actionStr = actionContent || '';\n }\n\n // Parse actions\n const allActions = actionStr.split('\\n\\n');\n const actions: PredictionParsed[] = [];\n\n for (const rawStr of allActions) {\n // prettier-ignore\n const actionInstance = parseAction(rawStr.replace(/\\n/g, String.raw`\\n`).trimStart());\n let actionType = '';\n let actionInputs: ActionInputs = {};\n\n if (actionInstance) {\n actionType = actionInstance.function;\n const params = actionInstance.args;\n actionInputs = {};\n\n for (const [paramName, param] of Object.entries(params)) {\n if (!param) continue;\n const trimmedParam = (param as string).trim();\n\n if (paramName.includes('start_box') || paramName.includes('end_box')) {\n const oriBox = trimmedParam;\n // Remove parentheses and split\n const numbers = oriBox\n .replace(/[()[\\]]/g, '')\n .split(',')\n .filter((ori) => ori !== '');\n\n // Convert to float and scale\n const floatNumbers = numbers.map((num, idx) => {\n const factorIndex = idx % 2;\n if (modelVer === UITarsModelVersion.V1_5 && smartResizeFactors) {\n return Number.parseFloat(num) / smartResizeFactors[factorIndex];\n }\n return Number.parseFloat(num) / factors[factorIndex];\n });\n\n if (floatNumbers.length === 2) {\n floatNumbers.push(floatNumbers[0], floatNumbers[1]);\n }\n\n actionInputs[\n paramName.trim() as keyof Omit<\n ActionInputs,\n 'start_coords' | 'end_coords'\n >\n ] = JSON.stringify(floatNumbers);\n\n if (screenContext?.width && screenContext?.height) {\n const boxKey = paramName.includes('start_box')\n ? 'start_coords'\n : 'end_coords';\n const [x1, y1, x2 = x1, y2 = y1] = floatNumbers;\n const [widthFactor, heightFactor] = factors;\n\n actionInputs[boxKey] = [x1, y1, x2, y2].every(isNumber)\n ? [\n (Math.round(\n ((x1 + x2) / 2) * screenContext?.width * widthFactor,\n ) /\n widthFactor) *\n (scaleFactor ?? 1),\n (Math.round(\n ((y1 + y2) / 2) * screenContext?.height * heightFactor,\n ) /\n heightFactor) *\n (scaleFactor ?? 1),\n ]\n : [];\n }\n } else {\n actionInputs[\n paramName.trim() as keyof Omit<\n ActionInputs,\n 'start_coords' | 'end_coords'\n >\n ] = trimmedParam;\n }\n }\n }\n\n actions.push({\n reflection: reflection,\n thought: thought || '',\n action_type: actionType,\n action_inputs: actionInputs,\n });\n }\n\n return actions;\n}\n/**\n * Parses an action string into a structured object\n * @param {string} actionStr - The action string to parse (e.g. \"click(start_box='(279,81)')\")\n * @returns {Object|null} Parsed action object or null if parsing fails\n */\nfunction parseAction(actionStr: string) {\n try {\n // Support format: click(start_box='<|box_start|>(x1,y1)<|box_end|>')\n actionStr = actionStr.replace(/<\\|box_start\\|>|<\\|box_end\\|>/g, '');\n\n // Support format: click(point='<point>510 150</point>') => click(start_box='<point>510 150</point>')\n // Support format: drag(start_point='<point>458 328</point>', end_point='<point>350 309</point>') => drag(start_box='<point>458 328</point>', end_box='<point>350 309</point>')\n actionStr = actionStr\n .replace(/(?<!start_|end_)point=/g, 'start_box=')\n .replace(/start_point=/g, 'start_box=')\n .replace(/end_point=/g, 'end_box=');\n\n // Match function name and arguments using regex\n const functionPattern = /^(\\w+)\\((.*)\\)$/;\n const match = actionStr.trim().match(functionPattern);\n\n if (!match) {\n throw new Error('Not a function call');\n }\n\n const [_, functionName, argsStr] = match;\n\n // Parse keyword arguments\n const kwargs = {};\n\n if (argsStr.trim()) {\n // Split on commas that aren't inside quotes or parentheses\n const argPairs = argsStr.match(/([^,']|'[^']*')+/g) || [];\n\n for (const pair of argPairs) {\n const [key, ...valueParts] = pair.split('=');\n if (!key) continue;\n\n let value = valueParts\n .join('=')\n .trim()\n .replace(/^['\"]|['\"]$/g, ''); // Remove surrounding quotes\n\n // Support format: click(start_box='<bbox>637 964 637 964</bbox>')\n if (value.includes('<bbox>')) {\n value = value.replace(/<bbox>|<\\/bbox>/g, '').replace(/\\s+/g, ',');\n value = `(${value})`;\n }\n\n // Support format: click(point='<point>510 150</point>')\n if (value.includes('<point>')) {\n value = value.replace(/<point>|<\\/point>/g, '').replace(/\\s+/g, ',');\n value = `(${value})`;\n }\n\n //@ts-ignore\n kwargs[key.trim()] = value;\n }\n }\n\n return {\n function: functionName,\n args: kwargs,\n };\n } catch (e) {\n console.error(`Failed to parse action '${actionStr}': ${e}`);\n return null;\n }\n}\n"],"names":["roundByFactor","num","factor","Math","floorByFactor","ceilByFactor","smartResizeForV15","height","width","maxRatio","MAX_RATIO","IMAGE_FACTOR","minPixels","MIN_PIXELS","maxPixels","MAX_PIXELS_V1_5","console","wBar","hBar","beta","actionParser","params","prediction","mode","screenContext","scaleFactor","modelVer","parsed","parseActionVlm","Array","text","factors","UITarsModelVersion","reflection","thought","actionStr","smartResizeFactors","thoughtMatch","reflectionMatch","summaryMatch","keyword","actionParts","actionSummaryMatch","actionMatch","thoughtContent","actionSummaryContent","actionContent","allActions","actions","rawStr","actionInstance","parseAction","String","actionType","actionInputs","paramName","param","Object","trimmedParam","oriBox","numbers","ori","floatNumbers","idx","factorIndex","Number","JSON","boxKey","x1","y1","x2","y2","widthFactor","heightFactor","isNumber","functionPattern","match","Error","_","functionName","argsStr","kwargs","argPairs","pair","key","valueParts","value","e"],"mappings":";;;;;;AAeA,SAASA,cAAcC,GAAW,EAAEC,MAAc;IAChD,OAAOC,KAAK,KAAK,CAACF,MAAMC,UAAUA;AACpC;AAEA,SAASE,cAAcH,GAAW,EAAEC,MAAc;IAChD,OAAOC,KAAK,KAAK,CAACF,MAAMC,UAAUA;AACpC;AAEA,SAASG,aAAaJ,GAAW,EAAEC,MAAc;IAC/C,OAAOC,KAAK,IAAI,CAACF,MAAMC,UAAUA;AACnC;AAEA,SAASI,kBACPC,MAAc,EACdC,KAAa,EACbC,WAAmBC,SAAS,EAC5BR,SAAiBS,YAAY,EAC7BC,YAAoBC,UAAU,EAC9BC,YAAoBC,eAAe;IAEnC,IAAIZ,KAAK,GAAG,CAACI,QAAQC,SAASL,KAAK,GAAG,CAACI,QAAQC,SAASC,UAAU;QAChEO,QAAQ,KAAK,CACX,CAAC,2CAA2C,EAAEP,SAAS,MAAM,EAC3DN,KAAK,GAAG,CAACI,QAAQC,SAASL,KAAK,GAAG,CAACI,QAAQC,QAC3C;QAEJ,OAAO;IACT;IAEA,IAAIS,OAAOd,KAAK,GAAG,CAACD,QAAQF,cAAcQ,OAAON;IACjD,IAAIgB,OAAOf,KAAK,GAAG,CAACD,QAAQF,cAAcO,QAAQL;IAElD,IAAIgB,OAAOD,OAAOH,WAAW;QAC3B,MAAMK,OAAOhB,KAAK,IAAI,CAAEI,SAASC,QAASM;QAC1CI,OAAOd,cAAcG,SAASY,MAAMjB;QACpCe,OAAOb,cAAcI,QAAQW,MAAMjB;IACrC,OAAO,IAAIgB,OAAOD,OAAOL,WAAW;QAClC,MAAMO,OAAOhB,KAAK,IAAI,CAACS,YAAaL,CAAAA,SAASC,KAAI;QACjDU,OAAOb,aAAaE,SAASY,MAAMjB;QACnCe,OAAOZ,aAAaG,QAAQW,MAAMjB;IACpC;IAEA,OAAO;QAACe;QAAMC;KAAK;AACrB;AAEO,SAASE,aAAaC,MAW5B;IAGC,MAAM,EAAEC,UAAU,EAAEpB,MAAM,EAAEqB,IAAI,EAAEC,aAAa,EAAEC,WAAW,EAAEC,QAAQ,EAAE,GACtEL;IAEF,MAAMM,SAASC,eACbN,YACAO,MAAM,OAAO,CAAC3B,UAAUA,SAAS;QAACA;QAAQA;KAAO,EACjDqB,MACAC,eACAC,aACAC;IAGF,OAAO;QACLC;IACF;AACF;AAEO,SAASC,eACdE,IAAY,EACZC,UAA4B;IAAC;IAAM;CAAK,EACxCR,OAAoB,IAAI,EACxBC,aAGC,EACDC,WAAoB,EACpBC,WAA+BM,mBAAmB,IAAI;IAEtD,IAAIC,aAA4B;IAChC,IAAIC,UAAyB;IAC7B,IAAIC,YAAY;IAEhB,IAAIC,qBAA8C;IAClD,IACEV,aAAaM,mBAAmB,IAAI,IACpCR,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,MAAM,AAAD,KACpBA,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,KAAK,AAAD,GAEnBY,qBAAqB9B,kBACnBkB,cAAc,MAAM,EACpBA,cAAc,KAAK;IAIvBM,OAAOA,KAAK,IAAI;IAChB,IAAIP,AAAS,SAATA,MAAe;QAEjB,IAAIO,KAAK,QAAQ,CAAC,aAAa;YAC7B,MAAMO,eAAeP,KAAK,KAAK,CAC7B;YAGF,IAAIO,cACFH,UAAUG,YAAY,CAAC,EAAE,CAAC,IAAI;QAElC,OAAO,IAAIP,KAAK,UAAU,CAAC,gBAAgB;YACzC,MAAMQ,kBAAkBR,KAAK,KAAK,CAChC;YAEF,IAAIQ,iBAAiB;gBACnBJ,UAAUI,eAAe,CAAC,EAAE,CAAC,IAAI;gBACjCL,aAAaK,eAAe,CAAC,EAAE,CAAC,IAAI;YACtC;QACF,OAAO,IAAIR,KAAK,UAAU,CAAC,oBAAoB;YAC7C,MAAMS,eAAeT,KAAK,KAAK,CAC7B;YAEF,IAAIS,cACFL,UAAUK,YAAY,CAAC,EAAE,CAAC,IAAI;QAElC;QAEA,IAAK;YAAC;YAAW;SAAU,CAAC,IAAI,CAAC,CAACC,UAAYV,KAAK,QAAQ,CAACU,WAGrD;YACL,MAAMC,cAAcX,KAAK,KAAK,CAAC;YAC/BK,YAAYM,WAAW,CAACA,YAAY,MAAM,GAAG,EAAE;QACjD,OAJEN,YAAYL;IAKhB,OAAO,IAAIP,AAAS,SAATA,MAAe;QAExB,MAAMc,eAAeP,KAAK,KAAK,CAAC;QAChC,MAAMY,qBAAqBZ,KAAK,KAAK,CACnC;QAEF,MAAMa,cAAcb,KAAK,KAAK,CAAC;QAE/B,MAAMc,iBAAiBP,eAAeA,YAAY,CAAC,EAAE,GAAG;QACxD,MAAMQ,uBAAuBH,qBACzBA,kBAAkB,CAAC,EAAE,GACrB;QACJ,MAAMI,gBAAgBH,cAAcA,WAAW,CAAC,EAAE,GAAG;QAErDT,UAAU,GAAGU,eAAe,oBAAoB,EAAEC,sBAAsB;QACxEV,YAAYW,iBAAiB;IAC/B;IAGA,MAAMC,aAAaZ,UAAU,KAAK,CAAC;IACnC,MAAMa,UAA8B,EAAE;IAEtC,KAAK,MAAMC,UAAUF,WAAY;QAE/B,MAAMG,iBAAiBC,YAAYF,OAAO,OAAO,CAAC,OAAOG,OAAO,GAAG,CAAC,EAAE,CAAC,EAAE,SAAS;QAClF,IAAIC,aAAa;QACjB,IAAIC,eAA6B,CAAC;QAElC,IAAIJ,gBAAgB;YAClBG,aAAaH,eAAe,QAAQ;YACpC,MAAM7B,SAAS6B,eAAe,IAAI;YAClCI,eAAe,CAAC;YAEhB,KAAK,MAAM,CAACC,WAAWC,MAAM,IAAIC,OAAO,OAAO,CAACpC,QAAS;gBACvD,IAAI,CAACmC,OAAO;gBACZ,MAAME,eAAgBF,MAAiB,IAAI;gBAE3C,IAAID,UAAU,QAAQ,CAAC,gBAAgBA,UAAU,QAAQ,CAAC,YAAY;oBACpE,MAAMI,SAASD;oBAEf,MAAME,UAAUD,OACb,OAAO,CAAC,YAAY,IACpB,KAAK,CAAC,KACN,MAAM,CAAC,CAACE,MAAQA,AAAQ,OAARA;oBAGnB,MAAMC,eAAeF,QAAQ,GAAG,CAAC,CAAC3D,KAAK8D;wBACrC,MAAMC,cAAcD,MAAM;wBAC1B,IAAIrC,aAAaM,mBAAmB,IAAI,IAAII,oBAC1C,OAAO6B,OAAO,UAAU,CAAChE,OAAOmC,kBAAkB,CAAC4B,YAAY;wBAEjE,OAAOC,OAAO,UAAU,CAAChE,OAAO8B,OAAO,CAACiC,YAAY;oBACtD;oBAEA,IAAIF,AAAwB,MAAxBA,aAAa,MAAM,EACrBA,aAAa,IAAI,CAACA,YAAY,CAAC,EAAE,EAAEA,YAAY,CAAC,EAAE;oBAGpDR,YAAY,CACVC,UAAU,IAAI,GAIf,GAAGW,KAAK,SAAS,CAACJ;oBAEnB,IAAItC,AAAAA,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,KAAK,AAAD,KAAKA,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,MAAM,AAAD,GAAG;wBACjD,MAAM2C,SAASZ,UAAU,QAAQ,CAAC,eAC9B,iBACA;wBACJ,MAAM,CAACa,IAAIC,IAAIC,KAAKF,EAAE,EAAEG,KAAKF,EAAE,CAAC,GAAGP;wBACnC,MAAM,CAACU,aAAaC,aAAa,GAAG1C;wBAEpCuB,YAAY,CAACa,OAAO,GAAG;4BAACC;4BAAIC;4BAAIC;4BAAIC;yBAAG,CAAC,KAAK,CAACG,mBAC1C;4BACGvE,KAAK,KAAK,CACPiE,AAAAA,CAAAA,KAAKE,EAAC,IAAK,IAAK9C,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,KAAK,AAAD,IAAIgD,eAEzCA,cACC/C,CAAAA,eAAe;4BACjBtB,KAAK,KAAK,CACPkE,AAAAA,CAAAA,KAAKE,EAAC,IAAK,IAAK/C,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,MAAM,AAAD,IAAIiD,gBAE1CA,eACChD,CAAAA,eAAe;yBACnB,GACD,EAAE;oBACR;gBACF,OACE6B,YAAY,CACVC,UAAU,IAAI,GAIf,GAAGG;YAER;QACF;QAEAV,QAAQ,IAAI,CAAC;YACX,YAAYf;YACZ,SAASC,WAAW;YACpB,aAAamB;YACb,eAAeC;QACjB;IACF;IAEA,OAAON;AACT;AAMA,SAASG,YAAYhB,SAAiB;IACpC,IAAI;QAEFA,YAAYA,UAAU,OAAO,CAAC,kCAAkC;QAIhEA,YAAYA,UACT,OAAO,CAAC,2BAA2B,cACnC,OAAO,CAAC,iBAAiB,cACzB,OAAO,CAAC,eAAe;QAG1B,MAAMwC,kBAAkB;QACxB,MAAMC,QAAQzC,UAAU,IAAI,GAAG,KAAK,CAACwC;QAErC,IAAI,CAACC,OACH,MAAM,IAAIC,MAAM;QAGlB,MAAM,CAACC,GAAGC,cAAcC,QAAQ,GAAGJ;QAGnC,MAAMK,SAAS,CAAC;QAEhB,IAAID,QAAQ,IAAI,IAAI;YAElB,MAAME,WAAWF,QAAQ,KAAK,CAAC,wBAAwB,EAAE;YAEzD,KAAK,MAAMG,QAAQD,SAAU;gBAC3B,MAAM,CAACE,KAAK,GAAGC,WAAW,GAAGF,KAAK,KAAK,CAAC;gBACxC,IAAI,CAACC,KAAK;gBAEV,IAAIE,QAAQD,WACT,IAAI,CAAC,KACL,IAAI,GACJ,OAAO,CAAC,gBAAgB;gBAG3B,IAAIC,MAAM,QAAQ,CAAC,WAAW;oBAC5BA,QAAQA,MAAM,OAAO,CAAC,oBAAoB,IAAI,OAAO,CAAC,QAAQ;oBAC9DA,QAAQ,CAAC,CAAC,EAAEA,MAAM,CAAC,CAAC;gBACtB;gBAGA,IAAIA,MAAM,QAAQ,CAAC,YAAY;oBAC7BA,QAAQA,MAAM,OAAO,CAAC,sBAAsB,IAAI,OAAO,CAAC,QAAQ;oBAChEA,QAAQ,CAAC,CAAC,EAAEA,MAAM,CAAC,CAAC;gBACtB;gBAGAL,MAAM,CAACG,IAAI,IAAI,GAAG,GAAGE;YACvB;QACF;QAEA,OAAO;YACL,UAAUP;YACV,MAAME;QACR;IACF,EAAE,OAAOM,GAAG;QACVvE,QAAQ,KAAK,CAAC,CAAC,wBAAwB,EAAEmB,UAAU,GAAG,EAAEoD,GAAG;QAC3D,OAAO;IACT;AACF"}