UNPKG

@centralinc/browseragent

Version:

Browser automation agent using Computer Use with Playwright

565 lines (548 loc) 19.7 kB
import { z } from 'zod';
import { Page } from 'playwright';
import { BetaToolComputerUse20241022, BetaToolComputerUse20250124, BetaTool } from '@anthropic-ai/sdk/resources/beta';
import { BetaMessageParam as BetaMessageParam$1 } from '@anthropic-ai/sdk/resources/beta/messages/messages';

/** Control signals accepted by {@link AgentController.signal} */
type ControlSignal = "pause" | "resume" | "cancel";
/** Event names; these literals match the keys of {@link AgentControllerEvents} */
type SignalEvent = "onPause" | "onResume" | "onCancel" | "onError";
/** Loose parameter bag; serves as the base of {@link PlaywrightActionParams} */
type ActionParams$1 = Record<string, unknown>;
/**
 * Result object produced by tool calls (see `ComputerUseTool.call`,
 * `PlaywrightCapabilityDef.handler`, `ToolCollection.run`).
 * Every field is optional.
 */
interface ToolResult {
    output?: string;
    error?: string;
    base64Image?: string;
    system?: string;
}
/**
 * Parameter definition for a custom (function-style) tool
 */
interface FunctionToolDef {
    name: string;
    type: "custom";
    input_schema: BetaTool.InputSchema;
}
/** Either of the two supported Anthropic computer-use tool definitions */
type ComputerToolDef = BetaToolComputerUse20241022 | BetaToolComputerUse20250124;
/** Any tool definition a {@link ComputerUseTool} may return from `toParams()` */
type ComputerUseToolDef = ComputerToolDef | FunctionToolDef;
/**
 * Contract implemented by every tool the agent can use
 */
interface ComputerUseTool {
    name: string;
    /** Produce the tool definition for this tool */
    toParams(): ComputerUseToolDef;
    /** Invoke the tool with the given parameters */
    call(params: Record<string, unknown>): Promise<ToolResult>;
}
/**
 * Configuration options for agent execution behavior
 */
interface ExecutionConfig {
    /** Typing behavior configuration */
    typing?: TypingConfig;
    /** Screenshot settings */
    screenshot?: ScreenshotConfig;
    /** Mouse interaction settings */
    mouse?: MouseConfig;
    /** Scrolling behavior settings */
    scrolling?: ScrollingConfig;
}
/**
 * Typing behavior configuration
 */
interface TypingConfig {
    /**
     * Typing mode with performance characteristics:
     * - 'fill': Fastest - directly fills input fields bypassing keyboard events entirely (6x+ faster than character-by-character)
     * - 'character-by-character': Human-like - simulates realistic keyboard events with configurable delays
     */
    mode: "character-by-character" | "fill";
    /** Delay between characters when using character-by-character mode (in milliseconds) */
    characterDelay?: number;
    /** Delay after typing completion (in milliseconds) */
    completionDelay?: number;
}
/**
 * Screenshot configuration
 */
interface ScreenshotConfig {
    /** Delay before taking screenshot (in seconds) */
    delay?: number;
    /** Screenshot quality settings */
    quality?: "low" | "medium" | "high";
}
/**
 * Mouse interaction configuration
 */
interface MouseConfig {
    /** Speed of mouse movements */
    moveSpeed?: "instant" | "fast" | "normal" | "slow";
    /** Click behavior settings */
    clickDelay?: number;
}
/**
 * Scrolling behavior configuration
 */
interface ScrollingConfig {
    /** Scrolling mode */
    mode?: "percentage";
    /** Default percentage of viewport to scroll */
    percentage?: number;
    /** Overlap percentage for context */
    overlap?: number;
}
/**
 * Simple interface for defining Playwright capabilities
 */
interface PlaywrightCapabilityDef {
    method: string;
    displayName: string;
    description: string;
    usage: string;
    schema: z.ZodSchema<unknown>;
    handler: (page: Page, args: string[]) => Promise<ToolResult>;
}
/**
 * Simple logger that logs everything in one place
 */
interface Logger {
    /** Log any event with type, message and optional data */
    log(type: string, message: string, data?: unknown): void;
    /** Agent lifecycle methods */
    agentStart(query: string, model: string, options?: unknown): void;
    agentComplete(query: string, duration: number, messageCount: number): void;
    agentError(query: string, error: Error, duration: number): void;
    /** LLM response logging */
    llmResponse(stopReason: string, step: number, content?: unknown): void;
    /** Tool execution logging */
    toolStart(toolName: string, step: number, input?: unknown): void;
    toolComplete(toolName: string, step: number, duration: number, output?: unknown): void;
    toolError(toolName: string, step: number, error: Error, duration: number): void;
    /** Signal and debug logging */
    signal(signal: string, step: number, reason?: string): void;
    debug(message: string, data?: unknown): void;
}
/**
 * Simple console logger that logs everything with timestamps
 */
declare class SimpleLogger implements Logger {
    private includeData;
    constructor(includeData?: boolean);
    log(type: string, message: string, data?: unknown): void;
    private truncateScreenshots;
    agentStart(query: string, model: string, options?: unknown): void;
    agentComplete(query: string, duration: number, messageCount: number): void;
    agentError(query: string, error: Error, duration: number): void;
    llmResponse(stopReason: string, step: number, content?: unknown): void;
    toolStart(toolName: string, step: number, input?: unknown): void;
    toolComplete(toolName: string, step: number, duration: number, output?: unknown): void;
    toolError(toolName: string, step: number, error: Error, duration: number): void;
    signal(signal: string, step: number, reason?: string): void;
    debug(message: string, data?: unknown): void;
}
/**
 * No-op logger for when logging is disabled
 */
declare class NoOpLogger implements Logger {
    log(): void;
    agentStart(): void;
    agentComplete(): void;
    agentError(): void;
    llmResponse(): void;
    toolStart(): void;
    toolComplete(): void;
    toolError(): void;
    signal(): void;
    debug(): void;
}
/**
 * Retry configuration for API calls (see the `retryConfig` constructor option
 * of {@link ComputerUseAgent}). Every field is optional.
 */
interface RetryConfig {
    maxRetries?: number;
    initialDelayMs?: number;
    maxDelayMs?: number;
    backoffMultiplier?: number;
    retryableErrors?: string[];
    /** Prefer IPv4 DNS resolution to avoid IPv6 connectivity issues (useful with VPNs like Tailscale) */
    preferIPv4?: boolean;
}
/**
 * Event callback interfaces for agent controller
 */
interface AgentControllerEvents {
    onPause: (data: {
        at: Date;
        step: number;
    }) => void;
    onResume: (data: {
        at: Date;
        step: number;
    }) => void;
    onCancel: (data: {
        at: Date;
        step: number;
        reason?: string;
    }) => void;
    onError: (data: {
        at: Date;
        step: number;
        error: unknown;
    }) => void;
}
/**
 * Controller interface for managing agent execution signals
 */
interface AgentController {
    /** Send a control signal to the running agent */
    signal(signal: ControlSignal): void;
    /** Subscribe to agent events - returns unsubscribe function */
    on<K extends keyof AgentControllerEvents>(event: K, callback: AgentControllerEvents[K]): () => void;
}
/**
 * Computer Use Agent for automating browser interactions with Claude
 *
 * This agent provides a clean interface to Anthropic's Computer Use capabilities,
 * allowing Claude to interact with web pages through Playwright.
 *
 * @see https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool
 */
declare class ComputerUseAgent {
    private apiKey;
    private model;
    private page;
    private signalBus;
    private executionConfig?;
    private playwrightCapabilities;
    private tools;
    private logger;
    private retryConfig;
    /** Expose control-flow signals */
    readonly controller: AgentController;
    /**
     * Create a new ComputerUseAgent instance
     *
     * @param options - Configuration options
     * @param options.apiKey - Anthropic API key (get one from https://console.anthropic.com/)
     * @param options.page - Playwright page instance to control
     * @param options.model - Anthropic model to use (defaults to claude-sonnet-4-20250514)
     * @param options.executionConfig - Tool behavior configuration (typing speed, screenshots, etc.)
     * @param options.playwrightCapabilities - Custom Playwright capabilities for this agent instance
     * @param options.tools - Additional tools for this agent instance
     * @param options.logger - Custom logger for agent operations (defaults to NoOpLogger, i.e. no logging)
     * @param options.retryConfig - Retry configuration for API calls
     *
     * @see https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool#model-compatibility
     */
    constructor({ apiKey, page, model, executionConfig, playwrightCapabilities, tools, logger, retryConfig, }: {
        /** Anthropic API key for authentication */
        apiKey: string;
        /** Playwright page instance to control */
        page: Page;
        /**
         * Anthropic model to use for computer use tasks
         * @default 'claude-sonnet-4-20250514'
         */
        model?: string;
        /**
         * Execution behavior configuration
         * Controls typing speed, screenshot settings, mouse behavior, etc.
         */
        executionConfig?: ExecutionConfig;
        /**
         * Custom Playwright capabilities for this agent instance
         * These are merged with built-in capabilities
         */
        playwrightCapabilities?: PlaywrightCapabilityDef[];
        /**
         * Additional tools for this agent instance
         * These can be any ComputerUseTool implementations
         */
        tools?: ComputerUseTool[];
        /**
         * Custom logger for agent operations
         * @default NoOpLogger (no logging)
         */
        logger?: Logger;
        /**
         * Retry configuration for API calls
         * Controls retry behavior for connection errors
         */
        retryConfig?: RetryConfig;
    });
    /**
     * Execute a computer use task with Claude
     *
     * This method can return either text responses or structured data validated against a Zod schema.
     *
     * @template T - The expected return type (string by default, or inferred from schema)
     * @param query - The task description for Claude to execute
     * @param schema - Optional Zod schema for structured responses
     * @param options - Additional execution options
     * @param options.systemPromptSuffix - Additional instructions appended to the system prompt
     * @param options.thinkingBudget - Token budget for Claude's internal reasoning (thinking is disabled when omitted; see the note on the option itself)
     * @param options.maxTokens - Maximum tokens for the response; must be greater than thinkingBudget (default: 4096)
     * @param options.onlyNMostRecentImages - Limit on the number of recent images kept in context (no limit when omitted)
     *
     * @returns Promise that resolves to either a string (when no schema) or validated data of type T
     *
     * @example
     * ```typescript
     * // Text response
     * const result = await agent.execute('Tell me the page title');
     *
     * // Structured response
     * const data = await agent.execute(
     *   'Get user info',
     *   z.object({ name: z.string(), age: z.number() })
     * );
     * ```
     *
     * @see https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking
     */
    execute<T = string>(query: string, schema?: z.ZodSchema<T>, options?: {
        /** Additional instructions appended to the system prompt */
        systemPromptSuffix?: string;
        /**
         * Token budget for Claude's internal reasoning process
         *
         * NOTE(review): the method-level summary previously advertised a default
         * of 1024, which contradicts the default stated below. The implementation
         * is not visible from this declaration file - confirm which one applies.
         * @default undefined (thinking disabled)
         * @see https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#working-with-thinking-budgets
         */
        thinkingBudget?: number;
        /**
         * Maximum tokens for the response
         * Must be greater than thinkingBudget
         * @default 4096
         */
        maxTokens?: number;
        /**
         * Limit number of recent images to include in context
         * Helps prevent "Too much media" errors in long-running tasks
         * @default undefined (no limit)
         */
        onlyNMostRecentImages?: number;
    }): Promise<T>;
    private extractTextFromMessage;
    private parseJsonResponse;
}

/** Parameters accepted by {@link PlaywrightTool.call}: a method name plus string arguments */
type PlaywrightActionParams = ActionParams$1 & {
    method: string;
    args: string[];
};
/**
 * Tool named "playwright"; holds a Playwright page and a map of capabilities
 */
declare class PlaywrightTool implements ComputerUseTool {
    name: "playwright";
    protected page: Page;
    protected capabilities: Map<string, PlaywrightCapabilityDef>;
    constructor(page: Page, instanceCapabilities?: PlaywrightCapabilityDef[]);
    /**
     * Get capability documentation for including in system prompt
     */
    getCapabilityDocs(): string;
    /**
     * Static method to get capability docs (for system prompt generation)
     */
    static getCapabilityDocs(): string;
    toParams(): FunctionToolDef;
    call(params: PlaywrightActionParams): Promise<ToolResult>;
}

/**
 * Action identifiers understood by {@link ComputerTool} (see {@link ActionParams.action})
 */
declare enum Action {
    MOUSE_MOVE = "mouse_move",
    LEFT_CLICK = "left_click",
    RIGHT_CLICK = "right_click",
    MIDDLE_CLICK = "middle_click",
    DOUBLE_CLICK = "double_click",
    TRIPLE_CLICK = "triple_click",
    LEFT_CLICK_DRAG = "left_click_drag",
    LEFT_MOUSE_DOWN = "left_mouse_down",
    LEFT_MOUSE_UP = "left_mouse_up",
    KEY = "key",
    TYPE = "type",
    HOLD_KEY = "hold_key",
    SCREENSHOT = "screenshot",
    CURSOR_POSITION = "cursor_position",
    SCROLL = "scroll",
    WAIT = "wait",
    EXTRACT_URL = "extract_url"
}
type ScrollDirection = "up" | "down" | "left" | "right";
/** A pair of numbers (presumably `[x, y]` - TODO confirm ordering against the implementation) */
type Coordinate = [number, number];
type Duration = number;
/**
 * Parameters accepted by {@link ComputerTool.call}
 *
 * Both `scrollAmount` and `scroll_amount` spellings are declared; the index
 * signature additionally permits arbitrary extra keys of the listed value types.
 */
interface ActionParams {
    action: Action;
    text?: string;
    coordinate?: Coordinate;
    scrollDirection?: ScrollDirection;
    scrollAmount?: number;
    scroll_amount?: number;
    duration?: Duration;
    [key: string]: Action | string | Coordinate | ScrollDirection | number | Duration | undefined;
}
/**
 * Tool named "computer"; bound to a Playwright page, a tool version and an execution config
 */
declare class ComputerTool implements ComputerUseTool {
    name: "computer";
    protected page: Page;
    protected version: "20241022" | "20250124";
    protected config: ExecutionConfig;
    private readonly mouseActions;
    private readonly keyboardActions;
    private readonly systemActions;
    constructor(page: Page, version?: "20241022" | "20250124", config?: ExecutionConfig);
    /** API type string; one literal per supported version */
    get apiType(): "computer_20241022" | "computer_20250124";
    toParams(): ComputerToolDef;
    private getMouseButton;
    private handleMouseAction;
    private handleKeyboardAction;
    screenshot(): Promise<ToolResult>;
    call(params: ActionParams): Promise<ToolResult>;
}
/** {@link ComputerTool} subclass whose constructor takes no version argument */
declare class ComputerTool20241022 extends ComputerTool {
    constructor(page: Page, config?: ExecutionConfig);
}
/** {@link ComputerTool} subclass whose constructor takes no version argument */
declare class ComputerTool20250124 extends ComputerTool {
    constructor(page: Page, config?: ExecutionConfig);
}

type ToolVersion = "computer_use_20250124" | "computer_use_20241022" | "computer_use_20250429";
/**
 * A group of tools that can be exported as definitions and run by name
 */
declare class ToolCollection {
    private tools;
    private page?;
    constructor(...tools: ComputerUseTool[]);
    toParams(): ComputerUseToolDef[];
    setPage(page: Page): void;
    run(name: string, toolInput: Record<string, unknown>): Promise<ToolResult>;
}

/** Re-export alias for the Anthropic SDK's `BetaMessageParam` */
type BetaMessageParam = BetaMessageParam$1;
/**
 * Text content block with optional id and ephemeral cache-control marker
 */
interface BetaTextBlock {
    type: "text";
    text: string;
    id?: string;
    cache_control?: {
        type: "ephemeral";
    };
}

/**
 * Tool capability configuration
 * Generic interface that can be used by any tool (Playwright, Computer, etc.)
 */
interface ToolCapability {
    /** Tool name (e.g., 'playwright', 'computer') */
    tool: string;
    /** Method/action name identifier */
    method: string;
    /** Human-readable display name */
    displayName: string;
    /** Short description of what this capability does */
    description: string;
    /** Detailed usage instructions and examples */
    usage: string;
    /** Input parameter schema using Zod for validation */
    schema: z.ZodSchema<unknown>;
    /** Whether this capability is enabled by default */
    enabled?: boolean;
}
/**
 * Tool registry interface
 */
interface ToolRegistry {
    /** Register a new capability */
    register(capability: ToolCapability): void;
    /** Get a capability by tool and method name */
    get(tool: string, method: string): ToolCapability | undefined;
    /** Get all capabilities for a specific tool */
    getToolCapabilities(tool: string): ToolCapability[];
    /** Get all registered capabilities */
    getAll(): ToolCapability[];
    /** Check if a capability is enabled */
    isEnabled(tool: string, method: string): boolean;
    /** Generate documentation for a specific tool */
    generateToolDocs(tool: string): string;
    /** Generate documentation for all tools */
    generateAllDocs(): string;
    /** Validate method arguments against capability schema */
    validate(tool: string, method: string, args: unknown[]): {
        valid: boolean;
        errors?: string[];
    };
    /** Get all registered tool names */
    getToolNames(): string[];
}
/**
 * Decorator metadata for capability registration
 */
interface CapabilityDecoratorOptions {
    tool: string;
    displayName: string;
    description: string;
    usage: string;
    enabled?: boolean;
    category?: string;
    examples?: Array<{
        description: string;
        input: Record<string, unknown>;
        expectedOutput: string;
    }>;
    performance?: {
        speed: string;
        reliability: string;
        notes: string;
    };
}
/**
 * Configuration options for the tool registry
 */
interface ToolRegistryConfig {
    /** Whether to include capability documentation in system prompts */
    includeInSystemPrompt?: boolean;
    /** Whether to validate inputs before execution */
    validateInputs?: boolean;
    /** Custom capability filter function */
    filter?: (capability: ToolCapability) => boolean;
    /** Override default enabled state for specific capabilities */
    overrides?: Record<string, {
        enabled: boolean;
    }>;
}
/**
 * Base interface for tools that use the registry
 */
interface RegistryAwareTool {
    /** Tool name identifier */
    name: string;
    /** Get the tool registry instance */
    getRegistry(): ToolRegistry;
    /** Get documentation for this tool's capabilities */
    getCapabilityDocs(): string;
}

/**
 * Get or create the global tool registry
 */
declare function getToolRegistry(config?: ToolRegistryConfig): ToolRegistry;
/**
 * Reset the global registry (useful for testing)
 */
declare function resetToolRegistry(): void;

/**
 * Method decorator for registering tool capabilities
 *
 * @example
 * ```typescript
 * class PlaywrightTool {
 *   @capability({
 *     tool: 'playwright',
 *     displayName: 'Navigate to URL',
 *     description: 'Navigate directly to any URL or website',
 *     usage: 'Use this to navigate to any website directly without using the URL bar'
 *   })
 *   @capabilitySchema(z.object({ url: z.string() }))
 *   async executeGoto(args: string[]): Promise<ToolResult> {
 *     // implementation
 *   }
 * }
 * ```
 */
declare function capability(options: CapabilityDecoratorOptions): MethodDecorator;
/**
 * Parameter schema decorator for capability methods
 */
declare function capabilitySchema(schema: z.ZodSchema<unknown>): MethodDecorator;
/**
 * Class decorator to automatically register capabilities
 *
 * The `any[]` rest parameter is the conventional shape for a generic
 * constructor constraint (TypeScript requires it when a class is extended
 * as a mixin), so it is intentionally left as-is.
 */
declare function withCapabilities<T extends {
    new (...args: any[]): object;
}>(constructor: T): T;
/**
 * Helper to create a tool capability configuration
 */
declare function defineCapability(tool: string, method: string, options: Omit<CapabilityDecoratorOptions, "tool"> & {
    schema?: z.ZodSchema<unknown>;
}): ToolCapability;

/**
 * Pre-defined Playwright tool capabilities
 */
declare const PLAYWRIGHT_CAPABILITIES: ToolCapability[];
/**
 * Register Playwright capabilities in the tool registry
 */
declare function registerPlaywrightCapabilities(): void;

export { Action, type AgentController, type AgentControllerEvents, type BetaMessageParam, type BetaTextBlock, type CapabilityDecoratorOptions, ComputerTool, ComputerTool20241022, ComputerTool20250124, ComputerUseAgent, type ComputerUseTool, type ComputerUseToolDef, type ControlSignal, type ExecutionConfig, type Logger, type MouseConfig, NoOpLogger, PLAYWRIGHT_CAPABILITIES, type PlaywrightActionParams, type PlaywrightCapabilityDef, PlaywrightTool, type RegistryAwareTool, type RetryConfig, type ScreenshotConfig, type ScrollingConfig, type SignalEvent, SimpleLogger, type ToolCapability, ToolCollection, type ToolRegistry, type ToolRegistryConfig, type ToolResult, type ToolVersion, type TypingConfig, capability, capabilitySchema, defineCapability, getToolRegistry, registerPlaywrightCapabilities, resetToolRegistry, withCapabilities };