UNPKG

@ui-tars/action-parser

Version:
120 lines (119 loc) 3.92 kB
/**
 * Entry point: parses a raw model prediction into structured actions.
 *
 * @param {{ prediction: string, factor?: number }} params - `prediction` is
 *   the raw model output; `factor` is the divisor applied to box coordinates
 *   (when omitted, `parseActionVlm`'s default of 1000 is used).
 * @returns {{ parsed: Array<object> }} Object wrapping the list produced by
 *   `parseActionVlm`.
 */
function actionParser(params) {
  const { prediction, factor } = params;
  return { parsed: parseActionVlm(prediction, factor) };
}

/**
 * Splits a model response into its thought / reflection text and a list of
 * parsed action calls.
 *
 * In "bc" mode the text may start with `Thought:`, `Reflection:` (followed by
 * `Action_Summary:`) or `Action_Summary:`; everything after the last
 * `Action:` marker (or the whole text if there is none) is treated as the
 * action list. In "o1" mode the thought, summary and action are read from
 * `<Thought>…</Thought>`, `Action_Summary:` and `Action: …</Output>`.
 * Any other mode leaves the action string empty, which yields a single entry
 * with an empty `action_type`.
 *
 * Actions are separated by a blank line. Parameters whose name contains
 * `start_box` or `end_box` are converted to a JSON array string of numbers
 * divided by `factor`; a two-number box is duplicated into four numbers.
 *
 * @param {string} text - Raw model output.
 * @param {number} [factor=1e3] - Divisor applied to every box coordinate.
 * @param {string} [mode="bc"] - Output format, "bc" or "o1".
 * @returns {Array<{reflection: (string|null), thought: string,
 *   action_type: string, action_inputs: Object<string, string>}>} One entry
 *   per action segment; segments that fail to parse get an empty
 *   `action_type` and empty `action_inputs`.
 */
function parseActionVlm(text, factor = 1e3, mode = "bc") {
  const source = text.trim();
  let reflection = null;
  let thought = null;
  let actionStr = "";

  if (mode === "bc") {
    if (source.startsWith("Thought:")) {
      const thoughtMatch = source.match(/Thought: ([\s\S]+?)(?=\s*Action:|$)/);
      if (thoughtMatch) {
        thought = thoughtMatch[1].trim();
      }
    } else if (source.startsWith("Reflection:")) {
      const reflectionMatch = source.match(
        /Reflection: ([\s\S]+?)Action_Summary: ([\s\S]+?)(?=\s*Action:|$)/
      );
      if (reflectionMatch) {
        reflection = reflectionMatch[1].trim();
        thought = reflectionMatch[2].trim();
      }
    } else if (source.startsWith("Action_Summary:")) {
      // [\s\S] (not `.`) so a summary spanning several lines is captured,
      // consistent with the Thought/Reflection branches above.
      const summaryMatch = source.match(
        /Action_Summary: ([\s\S]+?)(?=\s*Action:|$)/
      );
      if (summaryMatch) {
        thought = summaryMatch[1].trim();
      }
    }

    if (source.includes("Action:")) {
      // Only the text after the LAST "Action:" marker is parsed as actions.
      const actionParts = source.split("Action:");
      actionStr = actionParts[actionParts.length - 1];
    } else {
      actionStr = source;
    }
  } else if (mode === "o1") {
    const thoughtMatch = source.match(/<Thought>\s*(.*?)\s*<\/Thought>/);
    const actionSummaryMatch = source.match(
      /\nAction_Summary:\s*(.*?)\s*Action:/
    );
    const actionMatch = source.match(/\nAction:\s*(.*?)\s*<\/Output>/);

    const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
    const actionSummaryContent = actionSummaryMatch
      ? actionSummaryMatch[1]
      : null;
    const actionContent = actionMatch ? actionMatch[1] : null;

    // NOTE: a missing section is interpolated as the literal text "null".
    thought = `${thoughtContent} <Action_Summary> ${actionSummaryContent}`;
    actionStr = actionContent || "";
  }

  const actions = [];
  for (const rawStr of actionStr.split("\n\n")) {
    // Real newlines inside one action become the two characters `\n` so the
    // single-line call pattern in parseAction can still match.
    const actionInstance = parseAction(
      rawStr.replace(/\n/g, String.raw`\n`).trimStart()
    );

    let actionType = "";
    const actionInputs = {};

    if (actionInstance) {
      actionType = actionInstance.function;
      for (const [paramName, param] of Object.entries(actionInstance.args)) {
        if (!param) continue;

        const key = paramName.trim();
        const value = param.trim();
        const isBox =
          paramName.includes("start_box") || paramName.includes("end_box");

        if (!isBox) {
          actionInputs[key] = value;
          continue;
        }

        // Strip brackets/parentheses, then scale every coordinate by factor.
        const coords = value
          .replace(/[()[\]]/g, "")
          .split(",")
          .map((num) => Number.parseFloat(num) / factor);
        if (coords.length === 2) {
          // A point (x, y) is expanded to the box (x, y, x, y).
          coords.push(coords[0], coords[1]);
        }
        actionInputs[key] = JSON.stringify(coords);
      }
    }

    actions.push({
      reflection,
      thought: thought || "",
      action_type: actionType,
      action_inputs: actionInputs
    });
  }
  return actions;
}

/**
 * Parses a single call of the form `name(key='value', other="value")`.
 *
 * Arguments are split on commas that are outside single- or double-quoted
 * strings; one surrounding quote character is stripped from each end of a
 * value. On input that is not a function call, an error is logged with
 * `console.error` and `null` is returned.
 *
 * @param {string} actionStr - Text of one action.
 * @returns {{function: string, args: Object<string, string>}|null} The call
 *   name and its keyword arguments, or `null` when the text is not a call.
 */
function parseAction(actionStr) {
  const match = actionStr.trim().match(/^(\w+)\((.*)\)$/);
  if (!match) {
    console.error(
      `Failed to parse action '${actionStr}': Error: Not a function call`
    );
    return null;
  }

  const [, functionName, argsStr] = match;
  const kwargs = {};

  if (argsStr.trim()) {
    // Quoted runs (either quote style) are consumed whole so commas inside
    // them do not split an argument; the trailing lone `"` alternative keeps
    // an unbalanced double quote behaving like an ordinary character.
    const argPairs = argsStr.match(/(?:[^,'"]|'[^']*'|"[^"]*"|")+/g) || [];
    for (const pair of argPairs) {
      const [key, ...valueParts] = pair.split("=");
      if (!key) continue;
      // Re-join on "=" so values that themselves contain "=" stay intact.
      kwargs[key.trim()] = valueParts
        .join("=")
        .trim()
        .replace(/^['"]|['"]$/g, "");
    }
  }

  return { function: functionName, args: kwargs };
}

export { actionParser, parseActionVlm };