UNPKG

@onkernel/cu-playwright

Version:

Computer Use x Playwright SDK

257 lines (229 loc) 8.78 kB
import { Anthropic } from '@anthropic-ai/sdk'; import { DateTime } from 'luxon'; import type { Page } from 'playwright'; import type { BetaMessageParam, BetaTextBlock } from './types/beta'; import { ToolCollection, DEFAULT_TOOL_VERSION, TOOL_GROUPS_BY_VERSION, type ToolVersion } from './tools/collection'; import { responseToParams, maybeFilterToNMostRecentImages, injectPromptCaching, PROMPT_CACHING_BETA_FLAG } from './utils/message-processing'; import { makeApiToolResult } from './utils/tool-results'; import { ComputerTool20241022, ComputerTool20250124 } from './tools/computer'; import type { ActionParams } from './tools/types/computer'; import { Action } from './tools/types/computer'; // System prompt optimized for the environment const SYSTEM_PROMPT = `<SYSTEM_CAPABILITY> * You are utilising an Ubuntu virtual machine using ${process.arch} architecture with internet access. * When you connect to the display, CHROMIUM IS ALREADY OPEN. The url bar is not visible but it is there. * If you need to navigate to a new page, use ctrl+l to focus the url bar and then enter the url. * You won't be able to see the url bar from the screenshot but ctrl-l still works. * When viewing a page it can be helpful to zoom out so that you can see everything on the page. * Either that, or make sure you scroll down to see everything before deciding something isn't available. * When using your computer function calls, they take a while to run and send back to you. * Where possible/feasible, try to chain multiple of these calls all into one function calls request. * The current date is ${DateTime.now().toFormat('EEEE, MMMM d, yyyy')}. * After each step, take a screenshot and carefully evaluate if you have achieved the right outcome. * Explicitly show your thinking: "I have evaluated step X..." If not correct, try again. * Only when you confirm a step was executed correctly should you move on to the next one. </SYSTEM_CAPABILITY> <IMPORTANT> * When using Chromium, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". * Instead, click on the search bar on the center of the screen where it says "Search or enter address", and enter the appropriate search term or URL there. </IMPORTANT>`; // Add new type definitions interface ThinkingConfig { type: 'enabled'; budget_tokens: number; } interface ExtraBodyConfig { thinking?: ThinkingConfig; } interface ToolUseInput extends Record<string, unknown> { action: Action; } export async function samplingLoop({ model, systemPromptSuffix, messages, apiKey, onlyNMostRecentImages, maxTokens = 4096, toolVersion, thinkingBudget, tokenEfficientToolsBeta = false, playwrightPage, }: { model: string; systemPromptSuffix?: string; messages: BetaMessageParam[]; apiKey: string; onlyNMostRecentImages?: number; maxTokens?: number; toolVersion?: ToolVersion; thinkingBudget?: number; tokenEfficientToolsBeta?: boolean; playwrightPage: Page; }): Promise<BetaMessageParam[]> { const selectedVersion = toolVersion || DEFAULT_TOOL_VERSION; const toolGroup = TOOL_GROUPS_BY_VERSION[selectedVersion]; const toolCollection = new ToolCollection(...toolGroup.tools.map((Tool: typeof ComputerTool20241022 | typeof ComputerTool20250124) => new Tool(playwrightPage))); const system: BetaTextBlock = { type: 'text', text: `${SYSTEM_PROMPT}${systemPromptSuffix ? ' ' + systemPromptSuffix : ''}`, }; while (true) { const betas: string[] = toolGroup.beta_flag ? [toolGroup.beta_flag] : []; if (tokenEfficientToolsBeta) { betas.push('token-efficient-tools-2025-02-19'); } let imageTruncationThreshold = onlyNMostRecentImages || 0; const client = new Anthropic({ apiKey, maxRetries: 4 }); const enablePromptCaching = true; if (enablePromptCaching) { betas.push(PROMPT_CACHING_BETA_FLAG); injectPromptCaching(messages); onlyNMostRecentImages = 0; (system as BetaTextBlock).cache_control = { type: 'ephemeral' }; } if (onlyNMostRecentImages) { maybeFilterToNMostRecentImages( messages, onlyNMostRecentImages, imageTruncationThreshold ); } const extraBody: ExtraBodyConfig = {}; if (thinkingBudget) { extraBody.thinking = { type: 'enabled', budget_tokens: thinkingBudget }; } const toolParams = toolCollection.toParams(); const response = await client.beta.messages.create({ max_tokens: maxTokens, messages, model, system: [system], tools: toolParams, betas, ...extraBody, }); const responseParams = responseToParams(response); const loggableContent = responseParams.map(block => { if (block.type === 'tool_use') { return { type: 'tool_use', name: block.name, input: block.input }; } return block; }); console.log('=== LLM RESPONSE ==='); console.log('Stop reason:', response.stop_reason); console.log(loggableContent); console.log("===") messages.push({ role: 'assistant', content: responseParams, }); if (response.stop_reason === 'end_turn') { console.log('LLM has completed its task, ending loop'); return messages; } const toolResultContent = []; let hasToolUse = false; for (const contentBlock of responseParams) { if (contentBlock.type === 'tool_use' && contentBlock.name && contentBlock.input && typeof contentBlock.input === 'object') { const input = contentBlock.input as ToolUseInput; if ('action' in input && typeof input.action === 'string') { hasToolUse = true; const toolInput: ActionParams = { action: input.action as Action, ...Object.fromEntries( Object.entries(input).filter(([key]) => key !== 'action') ) }; try { const result = await toolCollection.run( contentBlock.name, toolInput ); const toolResult = makeApiToolResult(result, contentBlock.id!); toolResultContent.push(toolResult); } catch (error) { console.error(error); throw error; } } } } if (toolResultContent.length === 0 && !hasToolUse && response.stop_reason !== 'tool_use') { console.log('No tool use or results, and not waiting for tool use, ending loop'); return messages; } if (toolResultContent.length > 0) { messages.push({ role: 'user', content: toolResultContent, }); } } } /** * Simplified computer use loop for executing tasks with Claude * * This function provides a higher-level interface to the sampling loop, * accepting a simple query string instead of message arrays. * * @param options - Configuration options * @param options.query - The task description for Claude to execute * @param options.apiKey - Anthropic API key for authentication * @param options.playwrightPage - Playwright page instance to control * @param options.model - Anthropic model to use (default: claude-sonnet-4-20250514) * @param options.systemPromptSuffix - Additional instructions appended to system prompt * @param options.maxTokens - Maximum tokens for response (default: 4096) * @param options.toolVersion - Computer use tool version (auto-selected based on model) * @param options.thinkingBudget - Token budget for Claude's reasoning (default: 1024) * @param options.tokenEfficientToolsBeta - Enable token-efficient tools beta * @param options.onlyNMostRecentImages - Limit number of recent images to include * * @returns Promise resolving to array of conversation messages * * @see https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool * @see https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking */ export async function computerUseLoop({ query, apiKey, playwrightPage, model = 'claude-sonnet-4-20250514', systemPromptSuffix, maxTokens = 4096, toolVersion, thinkingBudget = 1024, tokenEfficientToolsBeta = false, onlyNMostRecentImages, }: { query: string; apiKey: string; playwrightPage: Page; model?: string; systemPromptSuffix?: string; maxTokens?: number; toolVersion?: ToolVersion; thinkingBudget?: number; tokenEfficientToolsBeta?: boolean; onlyNMostRecentImages?: number; }): Promise<BetaMessageParam[]> { return samplingLoop({ model, systemPromptSuffix, messages: [{ role: 'user', content: query, }], apiKey, maxTokens, toolVersion, thinkingBudget, tokenEfficientToolsBeta, onlyNMostRecentImages, playwrightPage, }); }