@langchain/openai
Version:
OpenAI integrations for LangChain.js
1 lines • 14.7 kB
Source Map (JSON)
{"version":3,"file":"computerUse.cjs","names":["z","options: ComputerUseOptions","input: ComputerUseInput","runtime: ToolRuntime<{ messages: BaseMessage[] }>","ToolMessage"],"sources":["../../src/tools/computerUse.ts"],"sourcesContent":["/* eslint-disable @typescript-eslint/no-explicit-any */\nimport { z } from \"zod/v4\";\nimport { OpenAI as OpenAIClient } from \"openai\";\nimport { tool, type DynamicStructuredTool } from \"@langchain/core/tools\";\nimport { type ToolRuntime } from \"@langchain/core/tools\";\nimport {\n ToolMessage,\n type AIMessage,\n type BaseMessage,\n} from \"@langchain/core/messages\";\n\n/**\n * The type of computer environment to control.\n */\nexport type ComputerUseEnvironment =\n | \"browser\"\n | \"mac\"\n | \"windows\"\n | \"linux\"\n | \"ubuntu\";\n\n/**\n * Re-export action types from OpenAI SDK for convenience.\n */\nexport type ComputerUseClickAction =\n OpenAIClient.Responses.ResponseComputerToolCall.Click;\nexport type ComputerUseDoubleClickAction =\n OpenAIClient.Responses.ResponseComputerToolCall.DoubleClick;\nexport type ComputerUseDragAction =\n OpenAIClient.Responses.ResponseComputerToolCall.Drag;\nexport type ComputerUseKeypressAction =\n OpenAIClient.Responses.ResponseComputerToolCall.Keypress;\nexport type ComputerUseMoveAction =\n OpenAIClient.Responses.ResponseComputerToolCall.Move;\nexport type ComputerUseScreenshotAction =\n OpenAIClient.Responses.ResponseComputerToolCall.Screenshot;\nexport type ComputerUseScrollAction =\n OpenAIClient.Responses.ResponseComputerToolCall.Scroll;\nexport type ComputerUseTypeAction =\n OpenAIClient.Responses.ResponseComputerToolCall.Type;\nexport type ComputerUseWaitAction =\n OpenAIClient.Responses.ResponseComputerToolCall.Wait;\n\n/**\n * Union type of all computer use actions from OpenAI SDK.\n */\nexport type ComputerUseAction =\n OpenAIClient.Responses.ResponseComputerToolCall[\"action\"];\n\n// Zod schemas for computer use actions\nconst ComputerUseScreenshotActionSchema = z.object({\n type: z.literal(\"screenshot\"),\n});\n\nconst ComputerUseClickActionSchema = z.object({\n type: z.literal(\"click\"),\n x: z.number(),\n y: z.number(),\n button: z.enum([\"left\", \"right\", \"wheel\", \"back\", \"forward\"]).default(\"left\"),\n});\n\nconst ComputerUseDoubleClickActionSchema = z.object({\n type: z.literal(\"double_click\"),\n x: z.number(),\n y: z.number(),\n button: z.enum([\"left\", \"right\", \"wheel\", \"back\", \"forward\"]).default(\"left\"),\n});\n\nconst ComputerUseDragActionSchema = z.object({\n type: z.literal(\"drag\"),\n path: z.array(z.object({ x: z.number(), y: z.number() })),\n});\n\nconst ComputerUseKeypressActionSchema = z.object({\n type: z.literal(\"keypress\"),\n keys: z.array(z.string()),\n});\n\nconst ComputerUseMoveActionSchema = z.object({\n type: z.literal(\"move\"),\n x: z.number(),\n y: z.number(),\n});\n\nconst ComputerUseScrollActionSchema = z.object({\n type: z.literal(\"scroll\"),\n x: z.number(),\n y: z.number(),\n scroll_x: z.number(),\n scroll_y: z.number(),\n});\n\nconst ComputerUseTypeActionSchema = z.object({\n type: z.literal(\"type\"),\n text: z.string(),\n});\n\nconst ComputerUseWaitActionSchema = z.object({\n type: z.literal(\"wait\"),\n duration: z.number().optional(),\n});\n\n// Discriminated union schema for individual action types\nconst ComputerUseActionUnionSchema = z.discriminatedUnion(\"type\", [\n ComputerUseScreenshotActionSchema,\n ComputerUseClickActionSchema,\n ComputerUseDoubleClickActionSchema,\n ComputerUseDragActionSchema,\n ComputerUseKeypressActionSchema,\n ComputerUseMoveActionSchema,\n ComputerUseScrollActionSchema,\n ComputerUseTypeActionSchema,\n ComputerUseWaitActionSchema,\n]);\n\n// Schema for the input structure received from parseComputerCall\n// The action is wrapped in an `action` property: { action: { type: 'screenshot' } }\nexport const ComputerUseActionSchema = z.object({\n action: ComputerUseActionUnionSchema,\n});\n\n// TypeScript types derived from Zod schemas\nexport type ComputerUseScreenshotActionType = z.infer<\n typeof ComputerUseScreenshotActionSchema\n>;\nexport type ComputerUseClickActionType = z.infer<\n typeof ComputerUseClickActionSchema\n>;\nexport type ComputerUseDoubleClickActionType = z.infer<\n typeof ComputerUseDoubleClickActionSchema\n>;\nexport type ComputerUseDragActionType = z.infer<\n typeof ComputerUseDragActionSchema\n>;\nexport type ComputerUseKeypressActionType = z.infer<\n typeof ComputerUseKeypressActionSchema\n>;\nexport type ComputerUseMoveActionType = z.infer<\n typeof ComputerUseMoveActionSchema\n>;\nexport type ComputerUseScrollActionType = z.infer<\n typeof ComputerUseScrollActionSchema\n>;\nexport type ComputerUseTypeActionType = z.infer<\n typeof ComputerUseTypeActionSchema\n>;\nexport type ComputerUseWaitActionType = z.infer<\n typeof ComputerUseWaitActionSchema\n>;\n\n/**\n * Input structure for the Computer Use tool.\n * The action is wrapped in an `action` property.\n */\nexport interface ComputerUseInput {\n action: ComputerUseAction;\n}\n\nexport type ComputerUseReturnType =\n | string\n | Promise<string>\n | ToolMessage<any>\n | Promise<ToolMessage<any>>;\n\n/**\n * Options for the Computer Use tool.\n */\nexport interface ComputerUseOptions {\n /**\n * The width of the computer display in pixels.\n */\n displayWidth: number;\n\n /**\n * The height of the computer display in pixels.\n */\n displayHeight: number;\n\n /**\n * The type of computer environment to control.\n * - `browser`: Browser automation (recommended for most use cases)\n * - `mac`: macOS environment\n * - `windows`: Windows environment\n * - `linux`: Linux environment\n * - `ubuntu`: Ubuntu environment\n */\n environment: ComputerUseEnvironment;\n\n /**\n * Execute function that handles computer action execution.\n * This function receives the action input and should return a base64-encoded\n * screenshot of the result.\n */\n execute: (\n action: ComputerUseAction,\n runtime: ToolRuntime<any, any>\n ) => ComputerUseReturnType;\n}\n\n/**\n * OpenAI Computer Use tool type for the Responses API.\n */\nexport type ComputerUseTool = OpenAIClient.Responses.ComputerTool;\n\nconst TOOL_NAME = \"computer_use\";\n\n/**\n * Creates a Computer Use tool that allows models to control computer interfaces\n * and perform tasks by simulating mouse clicks, keyboard input, scrolling, and more.\n *\n * **Computer Use** is a practical application of OpenAI's Computer-Using Agent (CUA)\n * model (`computer-use-preview`), which combines vision capabilities with advanced\n * reasoning to simulate controlling computer interfaces.\n *\n * **How it works**:\n * The tool operates in a continuous loop:\n * 1. Model sends computer actions (click, type, scroll, etc.)\n * 2. Your code executes these actions in a controlled environment\n * 3. You capture a screenshot of the result\n * 4. Send the screenshot back to the model\n * 5. Repeat until the task is complete\n *\n * **Important**: Computer use is in beta and requires careful consideration:\n * - Use in sandboxed environments only\n * - Do not use for high-stakes or authenticated tasks\n * - Always implement human-in-the-loop for important decisions\n * - Handle safety checks appropriately\n *\n * @see {@link https://platform.openai.com/docs/guides/tools-computer-use | OpenAI Computer Use Documentation}\n *\n * @param options - Configuration options for the Computer Use tool\n * @returns A Computer Use tool that can be passed to `bindTools`\n *\n * @example\n * ```typescript\n * import { ChatOpenAI, tools } from \"@langchain/openai\";\n *\n * const model = new ChatOpenAI({ model: \"computer-use-preview\" });\n *\n * // With execute callback for automatic action handling\n * const computer = tools.computerUse({\n * displayWidth: 1024,\n * displayHeight: 768,\n * environment: \"browser\",\n * execute: async (action) => {\n * if (action.type === \"screenshot\") {\n * return captureScreenshot();\n * }\n * if (action.type === \"click\") {\n * await page.mouse.click(action.x, action.y, { button: action.button });\n * return captureScreenshot();\n * }\n * if (action.type === \"type\") {\n * await page.keyboard.type(action.text);\n * return captureScreenshot();\n * }\n * // Handle other actions...\n * return captureScreenshot();\n * },\n * });\n *\n * const llmWithComputer = model.bindTools([computer]);\n * const response = await llmWithComputer.invoke(\n * \"Check the latest news on bing.com\"\n * );\n * ```\n *\n * @example\n * ```typescript\n * // Without execute callback (manual action handling)\n * const computer = tools.computerUse({\n * displayWidth: 1024,\n * displayHeight: 768,\n * environment: \"browser\",\n * });\n *\n * const response = await model.invoke(\"Check the news\", {\n * tools: [computer],\n * });\n *\n * // Access the computer call from the response\n * const computerCall = response.additional_kwargs.tool_outputs?.find(\n * (output) => output.type === \"computer_call\"\n * );\n * if (computerCall) {\n * console.log(\"Action to execute:\", computerCall.action);\n * // Execute the action manually, then send back a screenshot\n * }\n * ```\n *\n * @example\n * ```typescript\n * // For macOS desktop automation with Docker\n * const computer = tools.computerUse({\n * displayWidth: 1920,\n * displayHeight: 1080,\n * environment: \"mac\",\n * execute: async (action) => {\n * if (action.type === \"click\") {\n * await dockerExec(\n * `DISPLAY=:99 xdotool mousemove ${action.x} ${action.y} click 1`,\n * containerName\n * );\n * }\n * // Capture screenshot from container\n * return await getDockerScreenshot(containerName);\n * },\n * });\n * ```\n *\n * @remarks\n * - Only available through the Responses API (not Chat Completions)\n * - Requires `computer-use-preview` model\n * - Actions include: click, double_click, drag, keypress, move, screenshot, scroll, type, wait\n * - Safety checks may be returned that require acknowledgment before proceeding\n * - Use `truncation: \"auto\"` parameter when making requests\n * - Recommended to use with `reasoning.summary` for debugging\n */\nexport function computerUse(options: ComputerUseOptions) {\n const computerTool = tool(\n async (\n input: ComputerUseInput,\n runtime: ToolRuntime<{ messages: BaseMessage[] }>\n ) => {\n /**\n * get computer_use call id from runtime\n */\n const aiMessage = runtime.state?.messages.at(-1) as AIMessage | undefined;\n const computerToolCall = aiMessage?.tool_calls?.find(\n (tc) => tc.name === \"computer_use\"\n );\n const computerToolCallId = computerToolCall?.id;\n if (!computerToolCallId) {\n throw new Error(\"Computer use call id not found\");\n }\n\n const result = await options.execute(input.action, runtime);\n\n /**\n * make sure {@link ToolMessage} is returned with the correct additional kwargs\n */\n if (typeof result === \"string\") {\n return new ToolMessage({\n content: result,\n tool_call_id: computerToolCallId,\n additional_kwargs: {\n type: \"computer_call_output\",\n },\n });\n }\n\n /**\n * make sure {@link ToolMessage} is returned with the correct additional kwargs\n */\n return new ToolMessage({\n ...result,\n tool_call_id: computerToolCallId,\n additional_kwargs: {\n type: \"computer_call_output\",\n ...result.additional_kwargs,\n },\n });\n },\n {\n name: TOOL_NAME,\n description:\n \"Control a computer interface by executing mouse clicks, keyboard input, scrolling, and other actions.\",\n schema: ComputerUseActionSchema,\n }\n );\n\n computerTool.extras = {\n ...(computerTool.extras ?? {}),\n providerToolDefinition: {\n type: \"computer_use_preview\",\n display_width: options.displayWidth,\n display_height: options.displayHeight,\n environment: options.environment,\n } satisfies ComputerUseTool,\n };\n\n /**\n * return as typed {@link DynamicStructuredTool} so we don't get any type\n * errors like \"can't export tool without reference\"\n */\n return computerTool as DynamicStructuredTool<\n typeof ComputerUseActionSchema,\n ComputerUseInput,\n unknown,\n ComputerUseReturnType\n >;\n}\n"],"mappings":";;;;;;AAkDA,MAAM,oCAAoCA,SAAE,OAAO,EACjD,MAAMA,SAAE,QAAQ,aAAa,CAC9B,EAAC;AAEF,MAAM,+BAA+BA,SAAE,OAAO;CAC5C,MAAMA,SAAE,QAAQ,QAAQ;CACxB,GAAGA,SAAE,QAAQ;CACb,GAAGA,SAAE,QAAQ;CACb,QAAQA,SAAE,KAAK;EAAC;EAAQ;EAAS;EAAS;EAAQ;CAAU,EAAC,CAAC,QAAQ,OAAO;AAC9E,EAAC;AAEF,MAAM,qCAAqCA,SAAE,OAAO;CAClD,MAAMA,SAAE,QAAQ,eAAe;CAC/B,GAAGA,SAAE,QAAQ;CACb,GAAGA,SAAE,QAAQ;CACb,QAAQA,SAAE,KAAK;EAAC;EAAQ;EAAS;EAAS;EAAQ;CAAU,EAAC,CAAC,QAAQ,OAAO;AAC9E,EAAC;AAEF,MAAM,8BAA8BA,SAAE,OAAO;CAC3C,MAAMA,SAAE,QAAQ,OAAO;CACvB,MAAMA,SAAE,MAAMA,SAAE,OAAO;EAAE,GAAGA,SAAE,QAAQ;EAAE,GAAGA,SAAE,QAAQ;CAAE,EAAC,CAAC;AAC1D,EAAC;AAEF,MAAM,kCAAkCA,SAAE,OAAO;CAC/C,MAAMA,SAAE,QAAQ,WAAW;CAC3B,MAAMA,SAAE,MAAMA,SAAE,QAAQ,CAAC;AAC1B,EAAC;AAEF,MAAM,8BAA8BA,SAAE,OAAO;CAC3C,MAAMA,SAAE,QAAQ,OAAO;CACvB,GAAGA,SAAE,QAAQ;CACb,GAAGA,SAAE,QAAQ;AACd,EAAC;AAEF,MAAM,gCAAgCA,SAAE,OAAO;CAC7C,MAAMA,SAAE,QAAQ,SAAS;CACzB,GAAGA,SAAE,QAAQ;CACb,GAAGA,SAAE,QAAQ;CACb,UAAUA,SAAE,QAAQ;CACpB,UAAUA,SAAE,QAAQ;AACrB,EAAC;AAEF,MAAM,8BAA8BA,SAAE,OAAO;CAC3C,MAAMA,SAAE,QAAQ,OAAO;CACvB,MAAMA,SAAE,QAAQ;AACjB,EAAC;AAEF,MAAM,8BAA8BA,SAAE,OAAO;CAC3C,MAAMA,SAAE,QAAQ,OAAO;CACvB,UAAUA,SAAE,QAAQ,CAAC,UAAU;AAChC,EAAC;AAGF,MAAM,+BAA+BA,SAAE,mBAAmB,QAAQ;CAChE;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;AACD,EAAC;AAIF,MAAa,0BAA0BA,SAAE,OAAO,EAC9C,QAAQ,6BACT,EAAC;AAqFF,MAAM,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAkHlB,SAAgB,YAAYC,SAA6B;CACvD,MAAM,gDACJ,OACEC,OACAC,YACG;;;;EAIH,MAAM,YAAY,QAAQ,OAAO,SAAS,GAAG,GAAG;EAChD,MAAM,mBAAmB,WAAW,YAAY,KAC9C,CAAC,OAAO,GAAG,SAAS,eACrB;EACD,MAAM,qBAAqB,kBAAkB;AAC7C,MAAI,CAAC,mBACH,OAAM,IAAI,MAAM;EAGlB,MAAM,SAAS,MAAM,QAAQ,QAAQ,MAAM,QAAQ,QAAQ;;;;AAK3D,MAAI,OAAO,WAAW,SACpB,QAAO,IAAIC,sCAAY;GACrB,SAAS;GACT,cAAc;GACd,mBAAmB,EACjB,MAAM,uBACP;EACF;;;;AAMH,SAAO,IAAIA,sCAAY;GACrB,GAAG;GACH,cAAc;GACd,mBAAmB;IACjB,MAAM;IACN,GAAG,OAAO;GACX;EACF;CACF,GACD;EACE,MAAM;EACN,aACE;EACF,QAAQ;CACT,EACF;CAED,aAAa,SAAS;EACpB,GAAI,aAAa,UAAU,CAAE;EAC7B,wBAAwB;GACtB,MAAM;GACN,eAAe,QAAQ;GACvB,gBAAgB,QAAQ;GACxB,aAAa,QAAQ;EACtB;CACF;;;;;AAMD,QAAO;AAMR"}