UNPKG

@ui-tars/sdk

Version:

A powerful cross-platform(ANY device/platform) toolkit for building GUI automation agents for UI-TARS

1 lines 15.9 kB
{"version":3,"file":"Model.mjs","sources":["webpack://@ui-tars/sdk/./src/Model.ts"],"sourcesContent":["/*\n * Copyright (c) 2025 Bytedance, Inc. and its affiliates.\n * SPDX-License-Identifier: Apache-2.0\n */\nimport OpenAI, { type ClientOptions } from 'openai';\nimport {\n type ChatCompletionCreateParamsNonStreaming,\n type ChatCompletionCreateParamsBase,\n type ChatCompletionMessageParam,\n} from 'openai/resources/chat/completions';\nimport { actionParser } from '@ui-tars/action-parser';\n\nimport { useContext } from './context/useContext';\nimport { Model, type InvokeParams, type InvokeOutput } from './types';\n\nimport {\n preprocessResizeImage,\n convertToOpenAIMessages,\n convertToResponseApiInput,\n isMessageImage,\n} from './utils';\nimport { DEFAULT_FACTORS } from './constants';\nimport {\n UITarsModelVersion,\n MAX_PIXELS_V1_0,\n MAX_PIXELS_V1_5,\n MAX_PIXELS_DOUBAO,\n} from '@ui-tars/shared/types';\nimport type {\n ResponseCreateParamsNonStreaming,\n ResponseInputItem,\n} from 'openai/resources/responses/responses';\n\ntype OpenAIChatCompletionCreateParams = Omit<ClientOptions, 'maxRetries'> &\n Pick<\n ChatCompletionCreateParamsBase,\n 'model' | 'max_tokens' | 'temperature' | 'top_p'\n >;\n\nexport interface UITarsModelConfig extends OpenAIChatCompletionCreateParams {\n /** Whether to use OpenAI Response API instead of Chat Completions API */\n useResponsesApi?: boolean;\n}\n\nexport interface ThinkingVisionProModelConfig\n extends ChatCompletionCreateParamsNonStreaming {\n thinking?: {\n type: 'enabled' | 'disabled';\n };\n}\n\nexport class UITarsModel extends Model {\n constructor(protected readonly modelConfig: UITarsModelConfig) {\n super();\n this.modelConfig = modelConfig;\n }\n\n get useResponsesApi(): boolean {\n return this.modelConfig.useResponsesApi ?? false;\n }\n private headImageContext: {\n messageIndex: number;\n responseIds: string[];\n } | null = null;\n\n /** [widthFactor, heightFactor] */\n get factors(): [number, number] {\n return DEFAULT_FACTORS;\n }\n\n get modelName(): string {\n return this.modelConfig.model ?? 'unknown';\n }\n\n /**\n * reset the model state\n */\n reset() {\n this.headImageContext = null;\n }\n\n /**\n * call real LLM / VLM Model\n * @param params\n * @param options\n * @returns\n */\n protected async invokeModelProvider(\n uiTarsVersion: UITarsModelVersion = UITarsModelVersion.V1_0,\n params: {\n messages: Array<ChatCompletionMessageParam>;\n previousResponseId?: string;\n },\n options: {\n signal?: AbortSignal;\n },\n headers?: Record<string, string>,\n ): Promise<{\n prediction: string;\n costTime?: number;\n costTokens?: number;\n responseId?: string;\n }> {\n const { logger } = useContext();\n const { messages, previousResponseId } = params;\n const {\n baseURL,\n apiKey,\n model,\n max_tokens = uiTarsVersion == UITarsModelVersion.V1_5 ? 65535 : 1000,\n temperature = 0,\n top_p = 0.7,\n ...restOptions\n } = this.modelConfig;\n\n const openai = new OpenAI({\n ...restOptions,\n maxRetries: 0,\n baseURL,\n apiKey,\n });\n\n const createCompletionPrams: ChatCompletionCreateParamsNonStreaming = {\n model,\n messages,\n stream: false,\n seed: null,\n stop: null,\n frequency_penalty: null,\n presence_penalty: null,\n // custom options\n max_tokens,\n temperature,\n top_p,\n };\n\n const createCompletionPramsThinkingVp: ThinkingVisionProModelConfig = {\n ...createCompletionPrams,\n thinking: {\n type: 'disabled',\n },\n };\n\n const startTime = Date.now();\n\n if (this.modelConfig.useResponsesApi) {\n const lastAssistantIndex = messages.findLastIndex(\n (c) => c.role === 'assistant',\n );\n logger.info('[ResponseAPI] lastAssistantIndex: ', lastAssistantIndex);\n // incremental messages\n const inputs = convertToResponseApiInput(\n lastAssistantIndex > -1\n ? messages.slice(lastAssistantIndex + 1)\n : messages,\n );\n\n // find the first image message\n const headImageMessageIndex = messages.findIndex(isMessageImage);\n if (\n this.headImageContext?.responseIds.length &&\n this.headImageContext?.messageIndex !== headImageMessageIndex\n ) {\n // The image window has slid. Delete the first image message.\n logger.info(\n '[ResponseAPI] should [delete]: ',\n this.headImageContext,\n 'headImageMessageIndex',\n headImageMessageIndex,\n );\n const headImageResponseId = this.headImageContext.responseIds.shift();\n\n if (headImageResponseId) {\n const deletedResponse = await openai.responses.delete(\n headImageResponseId,\n {\n headers,\n },\n );\n logger.info(\n '[ResponseAPI] [deletedResponse]: ',\n headImageResponseId,\n deletedResponse,\n );\n }\n }\n\n let result;\n let responseId = previousResponseId;\n for (const input of inputs) {\n const truncated = JSON.stringify(\n [input],\n (key, value) => {\n if (typeof value === 'string' && value.startsWith('data:image/')) {\n return value.slice(0, 50) + '...[truncated]';\n }\n return value;\n },\n 2,\n );\n const responseParams: ResponseCreateParamsNonStreaming = {\n input: [input],\n model,\n temperature,\n top_p,\n stream: false,\n max_output_tokens: max_tokens,\n ...(responseId && {\n previous_response_id: responseId,\n }),\n // @ts-expect-error\n thinking: {\n type: 'disabled',\n },\n };\n logger.info(\n '[ResponseAPI] [input]: ',\n truncated,\n 'previous_response_id',\n responseParams?.previous_response_id,\n 'headImageMessageIndex',\n headImageMessageIndex,\n );\n\n result = await openai.responses.create(responseParams, {\n ...options,\n timeout: 1000 * 30,\n headers,\n });\n logger.info('[ResponseAPI] [result]: ', result);\n responseId = result?.id;\n logger.info('[ResponseAPI] [responseId]: ', responseId);\n\n // head image changed\n if (responseId && isMessageImage(input)) {\n this.headImageContext = {\n messageIndex: headImageMessageIndex,\n responseIds: [\n ...(this.headImageContext?.responseIds || []),\n responseId,\n ],\n };\n }\n\n logger.info(\n '[ResponseAPI] [headImageContext]: ',\n this.headImageContext,\n );\n }\n\n return {\n prediction: result?.output_text ?? '',\n costTime: Date.now() - startTime,\n costTokens: result?.usage?.total_tokens ?? 0,\n responseId,\n };\n }\n\n // Use Chat Completions API if not using Response API\n const result = await openai.chat.completions.create(\n createCompletionPramsThinkingVp,\n {\n ...options,\n timeout: 1000 * 30,\n headers,\n },\n );\n\n return {\n prediction: result.choices?.[0]?.message?.content ?? '',\n costTime: Date.now() - startTime,\n costTokens: result.usage?.total_tokens ?? 0,\n };\n }\n\n async invoke(params: InvokeParams): Promise<InvokeOutput> {\n const {\n conversations,\n images,\n screenContext,\n scaleFactor,\n uiTarsVersion,\n headers,\n previousResponseId,\n } = params;\n const { logger, signal } = useContext();\n\n logger?.info(\n `[UITarsModel] invoke: screenContext=${JSON.stringify(screenContext)}, scaleFactor=${scaleFactor}, uiTarsVersion=${uiTarsVersion}, useResponsesApi=${this.modelConfig.useResponsesApi}`,\n );\n\n const maxPixels =\n uiTarsVersion === UITarsModelVersion.V1_5\n ? MAX_PIXELS_V1_5\n : uiTarsVersion === UITarsModelVersion.DOUBAO_1_5_15B ||\n uiTarsVersion === UITarsModelVersion.DOUBAO_1_5_20B\n ? MAX_PIXELS_DOUBAO\n : MAX_PIXELS_V1_0;\n const compressedImages = await Promise.all(\n images.map((image) => preprocessResizeImage(image, maxPixels)),\n );\n\n const messages = convertToOpenAIMessages({\n conversations,\n images: compressedImages,\n });\n\n const startTime = Date.now();\n const result = await this.invokeModelProvider(\n uiTarsVersion,\n {\n messages,\n previousResponseId,\n },\n {\n signal,\n },\n headers,\n )\n .catch((e) => {\n logger?.error('[UITarsModel] error', e);\n throw e;\n })\n .finally(() => {\n logger?.info(`[UITarsModel cost]: ${Date.now() - startTime}ms`);\n });\n\n if (!result.prediction) {\n const err = new Error();\n err.name = 'vlm response error';\n err.stack = JSON.stringify(result) ?? 'no message';\n logger?.error(err);\n throw err;\n }\n\n const { prediction, costTime, costTokens, responseId } = result;\n\n try {\n const { parsed: parsedPredictions } = actionParser({\n prediction,\n factor: this.factors,\n screenContext,\n scaleFactor,\n modelVer: uiTarsVersion,\n });\n return {\n prediction,\n parsedPredictions,\n costTime,\n costTokens,\n responseId,\n };\n } catch (error) {\n logger?.error('[UITarsModel] error', error);\n return {\n prediction,\n parsedPredictions: [],\n responseId,\n };\n }\n }\n}\n"],"names":["UITarsModel","Model","DEFAULT_FACTORS","uiTarsVersion","UITarsModelVersion","params","options","headers","_result_choices__message","_result_usage","logger","useContext","messages","previousResponseId","baseURL","apiKey","model","max_tokens","temperature","top_p","restOptions","openai","OpenAI","createCompletionPrams","createCompletionPramsThinkingVp","startTime","Date","_this_headImageContext","_this_headImageContext1","_result_usage1","lastAssistantIndex","c","inputs","convertToResponseApiInput","headImageMessageIndex","isMessageImage","headImageResponseId","deletedResponse","result","responseId","input","truncated","JSON","key","value","responseParams","_this_headImageContext2","conversations","images","screenContext","scaleFactor","signal","maxPixels","MAX_PIXELS_V1_5","MAX_PIXELS_DOUBAO","MAX_PIXELS_V1_0","compressedImages","Promise","image","preprocessResizeImage","convertToOpenAIMessages","e","err","Error","prediction","costTime","costTokens","parsedPredictions","actionParser","error","modelConfig"],"mappings":";;;;;;;;;;;AAGC;;;;;;;;;;AAgDM,MAAMA,oBAAoBC;IAM/B,IAAI,kBAA2B;QAC7B,OAAO,IAAI,CAAC,WAAW,CAAC,eAAe,IAAI;IAC7C;IAOA,IAAI,UAA4B;QAC9B,OAAOC;IACT;IAEA,IAAI,YAAoB;QACtB,OAAO,IAAI,CAAC,WAAW,CAAC,KAAK,IAAI;IACnC;IAKA,QAAQ;QACN,IAAI,CAAC,gBAAgB,GAAG;IAC1B;IAQA,MAAgB,oBACdC,gBAAoCC,mBAAmB,IAAI,EAC3DC,MAGC,EACDC,OAEC,EACDC,OAAgC,EAM/B;YAuKaC,0BAAAA,kBAAAA,iBAEAC;QAxKd,MAAM,EAAEC,MAAM,EAAE,GAAGC;QACnB,MAAM,EAAEC,QAAQ,EAAEC,kBAAkB,EAAE,GAAGR;QACzC,MAAM,EACJS,OAAO,EACPC,MAAM,EACNC,KAAK,EACLC,aAAad,iBAAiBC,mBAAmB,IAAI,GAAG,QAAQ,IAAI,EACpEc,cAAc,CAAC,EACfC,QAAQ,GAAG,EACX,GAAGC,aACJ,GAAG,IAAI,CAAC,WAAW;QAEpB,MAAMC,SAAS,IAAIC,SAAO;YACxB,GAAGF,WAAW;YACd,YAAY;YACZN;YACAC;QACF;QAEA,MAAMQ,wBAAgE;YACpEP;YACAJ;YACA,QAAQ;YACR,MAAM;YACN,MAAM;YACN,mBAAmB;YACnB,kBAAkB;YAElBK;YACAC;YACAC;QACF;QAEA,MAAMK,kCAAgE;YACpE,GAAGD,qBAAqB;YACxB,UAAU;gBACR,MAAM;YACR;QACF;QAEA,MAAME,YAAYC,KAAK,GAAG;QAE1B,IAAI,IAAI,CAAC,WAAW,CAAC,eAAe,EAAE;gBAelCC,wBACAC,yBA4FYC;YA3Gd,MAAMC,qBAAqBlB,SAAS,aAAa,CAC/C,CAACmB,IAAMA,AAAW,gBAAXA,EAAE,IAAI;YAEfrB,OAAO,IAAI,CAAC,sCAAsCoB;YAElD,MAAME,SAASC,0BACbH,qBAAqB,KACjBlB,SAAS,KAAK,CAACkB,qBAAqB,KACpClB;YAIN,MAAMsB,wBAAwBtB,SAAS,SAAS,CAACuB;YACjD,IACER,AAAAA,SAAAA,CAAAA,yBAAAA,IAAI,CAAC,gBAAgB,AAAD,IAApBA,KAAAA,IAAAA,uBAAuB,WAAW,CAAC,MAAM,AAAD,KACxCC,AAAAA,SAAAA,CAAAA,0BAAAA,IAAI,CAAC,gBAAgB,AAAD,IAApBA,KAAAA,IAAAA,wBAAuB,YAAY,AAAD,MAAMM,uBACxC;gBAEAxB,OAAO,IAAI,CACT,mCACA,IAAI,CAAC,gBAAgB,EACrB,yBACAwB;gBAEF,MAAME,sBAAsB,IAAI,CAAC,gBAAgB,CAAC,WAAW,CAAC,KAAK;gBAEnE,IAAIA,qBAAqB;oBACvB,MAAMC,kBAAkB,MAAMhB,OAAO,SAAS,CAAC,MAAM,CACnDe,qBACA;wBACE7B;oBACF;oBAEFG,OAAO,IAAI,CACT,qCACA0B,qBACAC;gBAEJ;YACF;YAEA,IAAIC;YACJ,IAAIC,aAAa1B;YACjB,KAAK,MAAM2B,SAASR,OAAQ;gBAC1B,MAAMS,YAAYC,KAAK,SAAS,CAC9B;oBAACF;iBAAM,EACP,CAACG,KAAKC;oBACJ,IAAI,AAAiB,YAAjB,OAAOA,SAAsBA,MAAM,UAAU,CAAC,gBAChD,OAAOA,MAAM,KAAK,CAAC,GAAG,MAAM;oBAE9B,OAAOA;gBACT,GACA;gBAEF,MAAMC,iBAAmD;oBACvD,OAAO;wBAACL;qBAAM;oBACdxB;oBACAE;oBACAC;oBACA,QAAQ;oBACR,mBAAmBF;oBACnB,GAAIsB,cAAc;wBAChB,sBAAsBA;oBACxB,CAAC;oBAED,UAAU;wBACR,MAAM;oBACR;gBACF;gBACA7B,OAAO,IAAI,CACT,2BACA+B,WACA,wBACAI,QAAAA,iBAAAA,KAAAA,IAAAA,eAAgB,oBAAoB,EACpC,yBACAX;gBAGFI,SAAS,MAAMjB,OAAO,SAAS,CAAC,MAAM,CAACwB,gBAAgB;oBACrD,GAAGvC,OAAO;oBACV,SAAS;oBACTC;gBACF;gBACAG,OAAO,IAAI,CAAC,4BAA4B4B;gBACxCC,aAAaD,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,EAAE;gBACvB5B,OAAO,IAAI,CAAC,gCAAgC6B;gBAG5C,IAAIA,cAAcJ,eAAeK,QAAQ;wBAI/BM;oBAHR,IAAI,CAAC,gBAAgB,GAAG;wBACtB,cAAcZ;wBACd,aAAa;+BACPY,AAAAA,SAAAA,CAAAA,0BAAAA,IAAI,CAAC,gBAAgB,AAAD,IAApBA,KAAAA,IAAAA,wBAAuB,WAAW,AAAD,KAAK,EAAE;4BAC5CP;yBACD;oBACH;gBACF;gBAEA7B,OAAO,IAAI,CACT,sCACA,IAAI,CAAC,gBAAgB;YAEzB;YAEA,OAAO;gBACL,YAAY4B,AAAAA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,WAAW,AAAD,KAAK;gBACnC,UAAUZ,KAAK,GAAG,KAAKD;gBACvB,YAAYI,AAAAA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,QAAAA,CAAAA,iBAAAA,OAAQ,KAAK,AAAD,IAAZA,KAAAA,IAAAA,eAAe,YAAY,AAAD,KAAK;gBAC3CU;YACF;QACF;QAGA,MAAMD,SAAS,MAAMjB,OAAO,IAAI,CAAC,WAAW,CAAC,MAAM,CACjDG,iCACA;YACE,GAAGlB,OAAO;YACV,SAAS;YACTC;QACF;QAGF,OAAO;YACL,YAAYC,AAAAA,SAAAA,CAAAA,kBAAAA,OAAO,OAAO,AAAD,IAAbA,KAAAA,IAAAA,QAAAA,CAAAA,mBAAAA,eAAgB,CAAC,EAAE,AAAD,IAAlBA,KAAAA,IAAAA,QAAAA,CAAAA,2BAAAA,iBAAqB,OAAO,AAAD,IAA3BA,KAAAA,IAAAA,yBAA8B,OAAO,AAAD,KAAK;YACrD,UAAUkB,KAAK,GAAG,KAAKD;YACvB,YAAYhB,AAAAA,SAAAA,CAAAA,gBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,cAAc,YAAY,AAAD,KAAK;QAC5C;IACF;IAEA,MAAM,OAAOJ,MAAoB,EAAyB;QACxD,MAAM,EACJ0C,aAAa,EACbC,MAAM,EACNC,aAAa,EACbC,WAAW,EACX/C,aAAa,EACbI,OAAO,EACPM,kBAAkB,EACnB,GAAGR;QACJ,MAAM,EAAEK,MAAM,EAAEyC,MAAM,EAAE,GAAGxC;QAE3BD,QAAAA,UAAAA,OAAQ,IAAI,CACV,CAAC,oCAAoC,EAAEgC,KAAK,SAAS,CAACO,eAAe,cAAc,EAAEC,YAAY,gBAAgB,EAAE/C,cAAc,kBAAkB,EAAE,IAAI,CAAC,WAAW,CAAC,eAAe,EAAE;QAGzL,MAAMiD,YACJjD,kBAAkBC,mBAAmB,IAAI,GACrCiD,kBACAlD,kBAAkBC,mBAAmB,cAAc,IACjDD,kBAAkBC,mBAAmB,cAAc,GACnDkD,oBACAC;QACR,MAAMC,mBAAmB,MAAMC,QAAQ,GAAG,CACxCT,OAAO,GAAG,CAAC,CAACU,QAAUC,sBAAsBD,OAAON;QAGrD,MAAMxC,WAAWgD,wBAAwB;YACvCb;YACA,QAAQS;QACV;QAEA,MAAM/B,YAAYC,KAAK,GAAG;QAC1B,MAAMY,SAAS,MAAM,IAAI,CAAC,mBAAmB,CAC3CnC,eACA;YACES;YACAC;QACF,GACA;YACEsC;QACF,GACA5C,SAEC,KAAK,CAAC,CAACsD;YACNnD,QAAAA,UAAAA,OAAQ,KAAK,CAAC,uBAAuBmD;YACrC,MAAMA;QACR,GACC,OAAO,CAAC;YACPnD,QAAAA,UAAAA,OAAQ,IAAI,CAAC,CAAC,oBAAoB,EAAEgB,KAAK,GAAG,KAAKD,UAAU,EAAE,CAAC;QAChE;QAEF,IAAI,CAACa,OAAO,UAAU,EAAE;YACtB,MAAMwB,MAAM,IAAIC;YAChBD,IAAI,IAAI,GAAG;YACXA,IAAI,KAAK,GAAGpB,KAAK,SAAS,CAACJ,WAAW;YACtC5B,QAAAA,UAAAA,OAAQ,KAAK,CAACoD;YACd,MAAMA;QACR;QAEA,MAAM,EAAEE,UAAU,EAAEC,QAAQ,EAAEC,UAAU,EAAE3B,UAAU,EAAE,GAAGD;QAEzD,IAAI;YACF,MAAM,EAAE,QAAQ6B,iBAAiB,EAAE,GAAGC,aAAa;gBACjDJ;gBACA,QAAQ,IAAI,CAAC,OAAO;gBACpBf;gBACAC;gBACA,UAAU/C;YACZ;YACA,OAAO;gBACL6D;gBACAG;gBACAF;gBACAC;gBACA3B;YACF;QACF,EAAE,OAAO8B,OAAO;YACd3D,QAAAA,UAAAA,OAAQ,KAAK,CAAC,uBAAuB2D;YACrC,OAAO;gBACLL;gBACA,mBAAmB,EAAE;gBACrBzB;YACF;QACF;IACF;IApTA,YAA+B+B,WAA8B,CAAE;QAC7D,KAAK,mDAOP,uBAAQ,oBAAR,cAR+BA,WAAW,GAAXA,aAAAA,IAAAA,CAQvB,gBAAgB,GAGb;QATT,IAAI,CAAC,WAAW,GAAGA;IACrB;AAkTF"}