UNPKG

@browserbasehq/stagehand

Version:

An AI web browsing framework focused on simplicity and extensibility.

1,408 lines (1,386 loc) 68.8 kB
import z, { ZodType, z as z$1, ZodError, ZodTypeAny } from 'zod/v3'; import { ClientOptions as ClientOptions$2 } from '@anthropic-ai/sdk'; import { LanguageModelV2 } from '@ai-sdk/provider'; import { ClientOptions as ClientOptions$1 } from 'openai'; import { generateObject, generateText, streamText, streamObject, experimental_generateImage, embed, embedMany, experimental_transcribe, experimental_generateSpeech, ToolSet } from 'ai'; import { Client, ClientOptions as ClientOptions$3 } from '@modelcontextprotocol/sdk/client/index.js'; import { Page as Page$1 } from 'playwright-core'; export { Page as PlaywrightPage } from 'playwright-core'; import { Page as Page$2 } from 'puppeteer-core'; export { Page as PuppeteerPage } from 'puppeteer-core'; import { Page as Page$3 } from 'patchright-core'; export { Page as PatchrightPage } from 'patchright-core'; import { Protocol } from 'devtools-protocol'; import { Buffer as Buffer$1 } from 'buffer'; import Browserbase from '@browserbasehq/sdk'; import { ChatCompletion } from 'openai/resources'; import { ToolSet as ToolSet$1 } from 'ai/dist'; import { Schema } from '@google/genai'; type AnthropicJsonSchemaObject = { definitions?: { MySchema?: { properties?: Record<string, unknown>; required?: string[]; }; }; properties?: Record<string, unknown>; required?: string[]; } & Record<string, unknown>; interface LLMTool { type: "function"; name: string; description: string; parameters: Record<string, unknown>; } type AISDKProvider = (modelName: string) => LanguageModelV2; type AISDKCustomProvider = (options: { apiKey: string; }) => AISDKProvider; type AvailableModel = "gpt-4.1" | "gpt-4.1-mini" | "gpt-4.1-nano" | "o4-mini" | "o3" | "o3-mini" | "o1" | "o1-mini" | "gpt-4o" | "gpt-4o-mini" | "gpt-4o-2024-08-06" | "gpt-4.5-preview" | "o1-preview" | "claude-3-5-sonnet-latest" | "claude-3-5-sonnet-20241022" | "claude-3-5-sonnet-20240620" | "claude-3-7-sonnet-latest" | "claude-3-7-sonnet-20250219" | "cerebras-llama-3.3-70b" | 
"cerebras-llama-3.1-8b" | "groq-llama-3.3-70b-versatile" | "groq-llama-3.3-70b-specdec" | "gemini-1.5-flash" | "gemini-1.5-pro" | "gemini-1.5-flash-8b" | "gemini-2.0-flash-lite" | "gemini-2.0-flash" | "gemini-2.5-flash-preview-04-17" | "gemini-2.5-pro-preview-03-25" | string; type ModelProvider = "openai" | "anthropic" | "cerebras" | "groq" | "google" | "aisdk"; type ClientOptions = ClientOptions$1 | ClientOptions$2; type ModelConfiguration = AvailableModel | (ClientOptions & { modelName: AvailableModel; }); type LogLevel = 0 | 1 | 2; /** * Mapping between numeric log levels and their names * * 0 - error/warn - Critical issues or important warnings * 1 - info - Standard information messages * 2 - debug - Detailed information for debugging */ declare const LOG_LEVEL_NAMES: Record<LogLevel, string>; type LogLine = { id?: string; category?: string; message: string; level?: LogLevel; timestamp?: string; auxiliary?: { [key: string]: { value: string; type: "object" | "string" | "html" | "integer" | "float" | "boolean"; }; }; }; type Logger = (logLine: LogLine) => void; interface ChatMessage { role: "system" | "user" | "assistant"; content: ChatMessageContent; } type ChatMessageContent = string | (ChatMessageImageContent | ChatMessageTextContent)[]; interface ChatMessageImageContent { type: string; image_url?: { url: string; }; text?: string; source?: { type: string; media_type: string; data: string; }; } interface ChatMessageTextContent { type: string; text: string; } declare const AnnotatedScreenshotText = "This is a screenshot of the current page state with the elements annotated on it. Each element id is annotated with a number to the top left of it. 
Duplicate annotations at the same location are under each other vertically."; interface ChatCompletionOptions { messages: ChatMessage[]; temperature?: number; top_p?: number; frequency_penalty?: number; presence_penalty?: number; image?: { buffer: Buffer; description?: string; }; response_model?: { name: string; schema: ZodType; }; tools?: LLMTool[]; tool_choice?: "auto" | "none" | "required"; maxOutputTokens?: number; requestId?: string; } type LLMResponse = { id: string; object: string; created: number; model: string; choices: { index: number; message: { role: string; content: string | null; tool_calls: { id: string; type: string; function: { name: string; arguments: string; }; }[]; }; finish_reason: string; }[]; usage: { prompt_tokens: number; completion_tokens: number; total_tokens: number; }; }; interface CreateChatCompletionOptions { options: ChatCompletionOptions; logger: (message: LogLine) => void; retries?: number; } declare abstract class LLMClient { type: "openai" | "anthropic" | "cerebras" | "groq" | (string & {}); modelName: AvailableModel | (string & {}); hasVision: boolean; clientOptions: ClientOptions; userProvidedInstructions?: string; constructor(modelName: AvailableModel, userProvidedInstructions?: string); abstract createChatCompletion<T = LLMResponse & { usage?: LLMResponse["usage"]; }>(options: CreateChatCompletionOptions): Promise<T>; generateObject: typeof generateObject; generateText: typeof generateText; streamText: typeof streamText; streamObject: typeof streamObject; generateImage: typeof experimental_generateImage; embed: typeof embed; embedMany: typeof embedMany; transcribe: typeof experimental_transcribe; generateSpeech: typeof experimental_generateSpeech; getLanguageModel?(): LanguageModelV2; } /** * CDP transport & session multiplexer * * Owns the browser WebSocket and multiplexes flattened Target sessions. * Tracks inflight CDP calls, routes responses to the right session, and forwards events. 
* * This does not interpret Page/DOM/Runtime semantics — callers own that logic. */ interface CDPSessionLike { send<R = unknown>(method: string, params?: object): Promise<R>; on<P = unknown>(event: string, handler: (params: P) => void): void; off<P = unknown>(event: string, handler: (params: P) => void): void; close(): Promise<void>; readonly id: string | null; } type EventHandler = (params: unknown) => void; declare class CdpConnection implements CDPSessionLike { private ws; private nextId; private inflight; private eventHandlers; private sessions; readonly id: string | null; private transportCloseHandlers; onTransportClosed(handler: (why: string) => void): void; offTransportClosed(handler: (why: string) => void): void; private emitTransportClosed; private constructor(); static connect(wsUrl: string): Promise<CdpConnection>; enableAutoAttach(): Promise<void>; send<R = unknown>(method: string, params?: object): Promise<R>; on<P = unknown>(event: string, handler: (params: P) => void): void; off<P = unknown>(event: string, handler: (params: P) => void): void; close(): Promise<void>; getSession(sessionId: string): CdpSession | undefined; attachToTarget(targetId: string): Promise<CdpSession>; getTargets(): Promise<Array<{ targetId: string; type: string; url: string; }>>; private onMessage; _sendViaSession<R = unknown>(sessionId: string, method: string, params?: object): Promise<R>; _onSessionEvent(sessionId: string, event: string, handler: EventHandler): void; _offSessionEvent(sessionId: string, event: string, handler: EventHandler): void; _dispatchToSession(sessionId: string, event: string, params: unknown): void; } declare class CdpSession implements CDPSessionLike { private readonly root; readonly id: string; constructor(root: CdpConnection, id: string); send<R = unknown>(method: string, params?: object): Promise<R>; on<P = unknown>(event: string, handler: (params: P) => void): void; off<P = unknown>(event: string, handler: (params: P) => void): void; close(): 
Promise<void>; dispatch(event: string, params: unknown): void; } interface FrameManager { session: CDPSessionLike; frameId: string; pageId: string; } /** * Frame * * A thin, session-bound handle to a specific DOM frame (by frameId). * All CDP calls in this class go through `this.session`, which MUST be the * owning session for `this.frameId`. Page is responsible for constructing * Frames with the correct session. */ declare class Frame implements FrameManager { session: CDPSessionLike; frameId: string; pageId: string; /** Owning CDP session id (useful for logs); null for root connection (should not happen for targets) */ readonly sessionId: string | null; constructor(session: CDPSessionLike, frameId: string, pageId: string); /** DOM.getNodeForLocation → DOM.describeNode */ getNodeAtLocation(x: number, y: number): Promise<Protocol.DOM.Node>; /** CSS selector → DOM.querySelector → DOM.getBoxModel */ getLocationForSelector(selector: string): Promise<{ x: number; y: number; width: number; height: number; }>; /** Accessibility.getFullAXTree (+ recurse into child frames if requested) */ getAccessibilityTree(withFrames?: boolean): Promise<Protocol.Accessibility.AXNode[]>; /** * Evaluate a function or expression in this frame's isolated world. * - If a string is provided, treated as a JS expression. * - If a function is provided, it is stringified and invoked with the optional argument. 
*/ evaluate<R = unknown, Arg = unknown>(pageFunctionOrExpression: string | ((arg: Arg) => R | Promise<R>), arg?: Arg): Promise<R>; /** Page.captureScreenshot (frame-scoped session) */ screenshot(options?: { fullPage?: boolean; clip?: { x: number; y: number; width: number; height: number; }; }): Promise<Buffer>; /** Child frames via Page.getFrameTree */ childFrames(): Promise<Frame[]>; /** Wait for a lifecycle state (load/domcontentloaded/networkidle) */ waitForLoadState(state?: "load" | "domcontentloaded" | "networkidle"): Promise<void>; /** Simple placeholder for your own locator abstraction */ locator(selector: string, options?: { deep?: boolean; depth?: number; }): Locator; /** Create/get an isolated world for this frame and return its executionContextId */ private getExecutionContextId; } type MouseButton = "left" | "right" | "middle"; /** * Locator * * Purpose: * A small, CDP-based element interaction helper scoped to a specific `Frame`. * It resolves a CSS/XPath selector inside the frame’s **isolated world**, and then * performs low-level actions (click, type, select) using DOM/Runtime/Input * protocol domains with minimal abstraction. * * Key change: * - Prefer **objectId**-based CDP calls (scroll, geometry) to avoid brittle * frontend nodeId mappings. nodeId is resolved on a best-effort basis and * returned for compatibility, but actions do not depend on it. * * Notes: * - Resolution is lazy: every action resolves the selector again. * - Uses `Page.createIsolatedWorld` so evaluation is isolated from page scripts. * - Releases remote objects (`Runtime.releaseObject`) where appropriate. 
*/ declare class Locator { private readonly frame; private readonly selector; private readonly options?; private readonly selectorResolver; private readonly selectorQuery; private readonly nthIndex; constructor(frame: Frame, selector: string, options?: { deep?: boolean; depth?: number; }, nthIndex?: number); /** Return the owning Frame for this locator (typed accessor, no private access). */ getFrame(): Frame; /** * Set files on an <input type="file"> element. * * Mirrors Playwright's Locator.setInputFiles basics: * - Accepts file path(s) or payload object(s) { name, mimeType, buffer }. * - Uses CDP DOM.setFileInputFiles under the hood. * - Best‑effort dispatches change/input via CDP (Chrome does by default). * - Passing an empty array clears the selection. */ setInputFiles(files: string | string[] | { name: string; mimeType: string; buffer: ArrayBuffer | Uint8Array | Buffer$1 | string; } | Array<{ name: string; mimeType: string; buffer: ArrayBuffer | Uint8Array | Buffer$1 | string; }>): Promise<void>; /** * Return the DOM backendNodeId for this locator's target element. * Useful for identity comparisons without needing element handles. */ backendNodeId(): Promise<Protocol.DOM.BackendNodeId>; /** Return how many nodes the current selector resolves to. */ count(): Promise<number>; /** * Return the center of the element's bounding box in the owning frame's viewport * (CSS pixels), rounded to integers. Scrolls into view best-effort. */ centroid(): Promise<{ x: number; y: number; }>; /** * Highlight the element's bounding box using the CDP Overlay domain. * - Scrolls element into view best-effort. * - Shows a semi-transparent overlay briefly, then hides it. */ highlight(options?: { durationMs?: number; borderColor?: { r: number; g: number; b: number; a?: number; }; contentColor?: { r: number; g: number; b: number; a?: number; }; }): Promise<void>; /** * Move the mouse cursor to the element's visual center without clicking. 
* - Scrolls into view best-effort, resolves geometry, then dispatches a mouse move. */ hover(): Promise<void>; /** * Click the element at its visual center. * Steps: * 1) Resolve selector to { objectId } in the frame world. * 2) Scroll into view via `DOM.scrollIntoViewIfNeeded({ objectId })`. * 3) Read geometry via `DOM.getBoxModel({ objectId })` → compute a center point. * 4) Synthesize mouse press + release via `Input.dispatchMouseEvent`. */ click(options?: { button?: MouseButton; clickCount?: number; }): Promise<void>; /** * Dispatch a DOM 'click' MouseEvent on the element itself. * - Does not synthesize real pointer input; directly dispatches an event. * - Useful for elements that rely on click handlers without needing hit-testing. */ sendClickEvent(options?: { bubbles?: boolean; cancelable?: boolean; composed?: boolean; detail?: number; }): Promise<void>; /** * Scroll the element vertically to a given percentage (0–100). * - If the element is <html> or <body>, scrolls the window/document. * - Otherwise, scrolls the element itself via element.scrollTo. */ scrollTo(percent: number | string): Promise<void>; /** * Fill an input/textarea/contenteditable element. * Mirrors Playwright semantics: the DOM helper either applies the native * value setter (for special input types) or asks us to type text via the CDP * Input domain after focusing/selecting. */ fill(value: string): Promise<void>; /** * Type text into the element (focuses first). * - Focus via element.focus() in page JS (no DOM.focus(nodeId)). * - If no delay, uses `Input.insertText` for efficiency. * - With delay, synthesizes `keyDown`/`keyUp` per character. */ type(text: string, options?: { delay?: number; }): Promise<void>; /** * Select one or more options on a `<select>` element. * Returns the values actually selected after the operation. */ selectOption(values: string | string[]): Promise<string[]>; /** * Return true if the element is attached and visible (rough heuristic). 
*/ isVisible(): Promise<boolean>; /** * Return true if the element is an input[type=checkbox|radio] and is checked. * Also considers aria-checked for ARIA widgets. */ isChecked(): Promise<boolean>; /** * Return the element's input value (for input/textarea/select/contenteditable). */ inputValue(): Promise<string>; /** * Return the element's textContent (raw, not innerText). */ textContent(): Promise<string>; /** * Return the element's innerHTML string. */ innerHtml(): Promise<string>; /** * Return the element's innerText (layout-aware, visible text). */ innerText(): Promise<string>; /** * For API parity, returns the same locator (querySelector already returns the first match). */ first(): Locator; /** Return a locator narrowed to the element at the given zero-based index. */ nth(index: number): Locator; /** * Resolve `this.selector` within the frame to `{ objectId, nodeId? }`: * Delegates to a shared selector resolver so all selector logic stays in sync. */ resolveNode(): Promise<{ nodeId: Protocol.DOM.NodeId | null; objectId: Protocol.Runtime.RemoteObjectId; }>; /** Compute a center point from a BoxModel content quad */ private centerFromBoxContent; } /** * DeepLocatorDelegate: a lightweight wrapper that looks like a Locator and * resolves to the correct frame/element on each call using hop/deep-XPath logic. 
* * Returned by `page.deepLocator()` for ergonomic, await-free chaining: * page.deepLocator('iframe#ifrA >> #btn').click() */ declare class DeepLocatorDelegate { private readonly page; private readonly root; private readonly selector; private readonly nthIndex; constructor(page: Page, root: Frame, selector: string, nthIndex?: number); private real; click(options?: { button?: "left" | "right" | "middle"; clickCount?: number; }): Promise<void>; count(): Promise<number>; hover(): Promise<void>; fill(value: string): Promise<void>; type(text: string, options?: { delay?: number; }): Promise<void>; selectOption(values: string | string[]): Promise<string[]>; scrollTo(percent: number | string): Promise<void>; isVisible(): Promise<boolean>; isChecked(): Promise<boolean>; inputValue(): Promise<string>; textContent(): Promise<string>; innerHtml(): Promise<string>; innerText(): Promise<string>; centroid(): Promise<{ x: number; y: number; }>; backendNodeId(): Promise<number>; highlight(options?: { durationMs?: number; borderColor?: { r: number; g: number; b: number; a?: number; }; contentColor?: { r: number; g: number; b: number; a?: number; }; }): Promise<void>; sendClickEvent(options?: { bubbles?: boolean; cancelable?: boolean; composed?: boolean; detail?: number; }): Promise<void>; setInputFiles(files: string | string[] | { name: string; mimeType: string; buffer: ArrayBuffer | Uint8Array | Buffer | string; } | Array<{ name: string; mimeType: string; buffer: ArrayBuffer | Uint8Array | Buffer | string; }>): Promise<void>; first(): DeepLocatorDelegate; nth(index: number): DeepLocatorDelegate; } /** * FrameLocator: resolves iframe elements to their child Frames and allows * creating locators scoped to that frame. Supports chaining. 
*/ declare class FrameLocator { private readonly parent?; private readonly selector; private readonly page; private readonly root?; constructor(page: Page, selector: string, parent?: FrameLocator, root?: Frame); /** Create a nested FrameLocator under this one. */ frameLocator(selector: string): FrameLocator; /** Resolve to the concrete Frame for this FrameLocator chain. */ resolveFrame(): Promise<Frame>; /** Return a Locator scoped to this frame. Methods delegate to the frame lazily. */ locator(selector: string): LocatorDelegate; } /** A small delegating wrapper that resolves the frame lazily per call. */ declare class LocatorDelegate { private readonly fl; private readonly sel; constructor(fl: FrameLocator, sel: string); private real; click(options?: { button?: "left" | "right" | "middle"; clickCount?: number; }): Promise<void>; hover(): Promise<void>; fill(value: string): Promise<void>; type(text: string, options?: { delay?: number; }): Promise<void>; selectOption(values: string | string[]): Promise<string[]>; scrollTo(percent: number | string): Promise<void>; isVisible(): Promise<boolean>; isChecked(): Promise<boolean>; inputValue(): Promise<string>; textContent(): Promise<string>; innerHtml(): Promise<string>; innerText(): Promise<string>; count(): Promise<number>; first(): LocatorDelegate; } type AnyPage = Page$1 | Page$2 | Page$3 | Page; type LoadState = "load" | "domcontentloaded" | "networkidle"; declare class StagehandAPIClient { private apiKey; private projectId; private sessionId?; private modelApiKey; private logger; private fetchWithCookies; constructor({ apiKey, projectId, logger }: StagehandAPIConstructorParams); init({ modelName, modelApiKey, domSettleTimeoutMs, verbose, systemPrompt, selfHeal, browserbaseSessionCreateParams, browserbaseSessionID, }: StartSessionParams): Promise<StartSessionResult>; act({ input, options, frameId }: APIActParameters): Promise<ActResult>; extract<T extends z.AnyZodObject>({ instruction, schema: zodSchema, options, 
frameId, }: APIExtractParameters): Promise<ExtractResult<T>>; observe({ instruction, options, frameId, }: APIObserveParameters): Promise<Action[]>; goto(url: string, options?: { waitUntil?: "load" | "domcontentloaded" | "networkidle"; }, frameId?: string): Promise<void>; agentExecute(agentConfig: AgentConfig, executeOptions: AgentExecuteOptions | string, frameId?: string): Promise<AgentResult>; end(): Promise<Response>; private execute; private request; } declare class Page { private readonly conn; private readonly mainSession; private readonly _targetId; /** Every CDP child session this page owns (top-level + adopted OOPIF sessions). */ private readonly sessions; /** Unified truth for frame topology + ownership. */ private readonly registry; /** A convenience wrapper bound to the current main frame id (top-level session). */ private mainFrameWrapper; /** Compact ordinal per frameId (used by snapshot encoding). */ private frameOrdinals; private nextOrdinal; /** cache Frames per frameId so everyone uses the same one */ private readonly frameCache; /** Stable id for Frames created by this Page (use top-level TargetId). */ private readonly pageId; /** Cached current URL for synchronous page.url() */ private _currentUrl; private navigationCommandSeq; private latestNavigationCommandId; private readonly networkManager; /** Optional API client for routing page operations to the API */ private readonly apiClient; private constructor(); private cursorEnabled; private ensureCursorScript; enableCursorOverlay(): Promise<void>; private updateCursor; /** * Factory: create Page and seed registry with the shallow tree from Page.getFrameTree. * Assumes Page domain is already enabled on the session passed in. */ static create(conn: CdpConnection, session: CDPSessionLike, targetId: string, apiClient?: StagehandAPIClient | null, localBrowserLaunchOptions?: LocalBrowserLaunchOptions | null): Promise<Page>; /** * Parent/child session emitted a `frameAttached`. 
* Topology update + ownership stamped to **emitting session**. */ onFrameAttached(frameId: string, parentId: string | null, session: CDPSessionLike): void; /** * Parent/child session emitted a `frameDetached`. */ onFrameDetached(frameId: string, reason?: "remove" | "swap" | string): void; /** * Parent/child session emitted a `frameNavigated`. * Topology + ownership update. Handles root swaps. */ onFrameNavigated(frame: Protocol.Page.Frame, session: CDPSessionLike): void; onNavigatedWithinDocument(frameId: string, url: string, session: CDPSessionLike): void; /** * An OOPIF child session whose **main** frame id equals the parent iframe’s frameId * has been attached; adopt the session into this Page and seed ownership for its subtree. */ adoptOopifSession(childSession: CDPSessionLike, childMainFrameId: string): void; /** Detach an adopted child session and prune its subtree */ detachOopifSession(sessionId: string): void; /** Return the owning CDP session for a frameId (falls back to main session) */ getSessionForFrame(frameId: string): CDPSessionLike; /** Always returns a Frame bound to the owning session */ frameForId(frameId: string): Frame; /** Expose a session by id (used by snapshot to resolve session id -> session) */ getSessionById(id: string): CDPSessionLike | undefined; registerSessionForNetwork(session: CDPSessionLike): void; unregisterSessionForNetwork(sessionId: string | undefined): void; targetId(): string; /** Seed the cached URL before navigation events converge. */ seedCurrentUrl(url: string | undefined | null): void; mainFrameId(): string; mainFrame(): Frame; /** * Close this top-level page (tab). Best-effort via Target.closeTarget. */ close(): Promise<void>; getFullFrameTree(): Protocol.Page.FrameTree; asProtocolFrameTree(rootMainFrameId: string): Protocol.Page.FrameTree; private ensureOrdinal; /** Public getter for snapshot code / handlers. 
*/ getOrdinal(frameId: string): number; listAllFrameIds(): string[]; /** * Navigate the page; optionally wait for a lifecycle state. * Waits on the **current** main frame and follows root swaps during navigation. */ goto(url: string, options?: { waitUntil?: LoadState; timeoutMs?: number; }): Promise<void>; /** * Reload the page; optionally wait for a lifecycle state. */ reload(options?: { waitUntil?: LoadState; timeoutMs?: number; ignoreCache?: boolean; }): Promise<void>; /** * Navigate back in history if possible; optionally wait for a lifecycle state. */ goBack(options?: { waitUntil?: LoadState; timeoutMs?: number; }): Promise<void>; /** * Navigate forward in history if possible; optionally wait for a lifecycle state. */ goForward(options?: { waitUntil?: LoadState; timeoutMs?: number; }): Promise<void>; /** * Return the current page URL (synchronous, cached from navigation events). */ url(): string; private beginNavigationCommand; isCurrentNavigationCommand(id: number): boolean; /** * Return the current page title. * Prefers reading from the active document via Runtime.evaluate to reflect dynamic changes. * Falls back to navigation history title if evaluation is unavailable. */ title(): Promise<string>; /** * Capture a screenshot (delegated to the current main frame). */ screenshot(options?: { fullPage?: boolean; }): Promise<Buffer>; /** * Create a locator bound to the current main frame. */ locator(selector: string): ReturnType<Frame["locator"]>; /** * Deep locator that supports cross-iframe traversal. * - Recognizes '>>' hop notation to enter iframe contexts. * - Supports deep XPath that includes iframe steps (e.g., '/html/body/iframe[2]//div'). * Returns a Locator scoped to the appropriate frame. */ deepLocator(selector: string): DeepLocatorDelegate; /** * Frame locator similar to Playwright: targets iframe elements and scopes * subsequent locators to that frame. Supports chaining. 
*/ frameLocator(selector: string): FrameLocator; /** * List all frames belonging to this page as Frame objects bound to their owning sessions. * The list is ordered by a stable ordinal assigned during the page lifetime. */ frames(): Frame[]; /** * Wait until the page reaches a lifecycle state on the current main frame. * Mirrors Playwright's API signatures. */ waitForLoadState(state: LoadState, timeoutMs?: number): Promise<void>; /** * Evaluate a function or expression in the current main frame's isolated world. * - If a string is provided, it is treated as a JS expression. * - If a function is provided, it is stringified and invoked with the optional argument. * - The return value should be JSON-serializable. Non-serializable objects will * best-effort serialize via JSON.stringify inside the page context. */ evaluate<R = unknown, Arg = unknown>(pageFunctionOrExpression: string | ((arg: Arg) => R | Promise<R>), arg?: Arg): Promise<R>; /** * Force the page viewport to an exact CSS size and device scale factor. * Ensures screenshots match width x height pixels when deviceScaleFactor = 1. */ setViewportSize(width: number, height: number, options?: { deviceScaleFactor?: number; }): Promise<void>; /** * Click at absolute page coordinates (CSS pixels). * Dispatches mouseMoved → mousePressed → mouseReleased via CDP Input domain * on the top-level page target's session. Coordinates are relative to the * viewport origin (top-left). Does not scroll. 
*/ click(x: number, y: number, options: { button?: "left" | "right" | "middle"; clickCount?: number; returnXpath: true; }): Promise<string>; click(x: number, y: number, options?: { button?: "left" | "right" | "middle"; clickCount?: number; returnXpath?: false; }): Promise<void>; click(x: number, y: number, options: { button?: "left" | "right" | "middle"; clickCount?: number; returnXpath: boolean; }): Promise<void | string>; scroll(x: number, y: number, deltaX: number, deltaY: number, options: { returnXpath: true; }): Promise<string>; scroll(x: number, y: number, deltaX: number, deltaY: number, options?: { returnXpath?: false; }): Promise<void>; scroll(x: number, y: number, deltaX: number, deltaY: number, options: { returnXpath: boolean; }): Promise<void | string>; /** * Drag from (fromX, fromY) to (toX, toY) using mouse events. * Sends mouseMoved → mousePressed → mouseMoved (steps) → mouseReleased. */ dragAndDrop(fromX: number, fromY: number, toX: number, toY: number, options: { button?: "left" | "right" | "middle"; steps?: number; delay?: number; returnXpath: true; }): Promise<[string, string]>; dragAndDrop(fromX: number, fromY: number, toX: number, toY: number, options?: { button?: "left" | "right" | "middle"; steps?: number; delay?: number; returnXpath?: false; }): Promise<void>; dragAndDrop(fromX: number, fromY: number, toX: number, toY: number, options: { button?: "left" | "right" | "middle"; steps?: number; delay?: number; returnXpath: boolean; }): Promise<void | [string, string]>; /** * Type a string by dispatching keyDown/keyUp events per character. * Focus must already be on the desired element. Uses CDP Input.dispatchKeyEvent * and never falls back to Input.insertText. Optional delay applies between * successive characters. */ type(text: string, options?: { delay?: number; withMistakes?: boolean; }): Promise<void>; /** * Press a single key or key combination (keyDown then keyUp). 
* For printable characters, uses the text path on keyDown; for named keys, sets key/code/VK. * Supports key combinations with modifiers like "Cmd+A", "Ctrl+C", "Shift+Tab", etc. */ keyPress(key: string, options?: { delay?: number; }): Promise<void>; private _pressedModifiers; /** Press a key down without releasing it */ private keyDown; /** Release a pressed key */ private keyUp; /** Normalize modifier key names to match CDP expectations */ private normalizeModifierKey; /** * Get the map of named keys with their properties */ private getNamedKeys; /** * Minimal description for printable keys (letters/digits/space) to provide code and VK. * Used when non-Shift modifiers are pressed to avoid sending text while keeping accelerator info. */ private describePrintableKey; private isMacOS; /** * Return Chromium mac editing commands (without trailing ':') for a given code like 'KeyA' * Only used on macOS to trigger system editing shortcuts (e.g., selectAll, copy, paste...). */ private macCommandsFor; /** * Create an isolated world for the **current** main frame and return its context id. */ private createIsolatedWorldForCurrentMain; /** * Wait until the **current** main frame reaches a lifecycle state. * - Fast path via `document.readyState`. * - Event path listens at the session level and compares incoming `frameId` * to `mainFrameId()` **at event time** to follow root swaps. 
*/ waitForMainLoadState(state: LoadState, timeoutMs?: number): Promise<void>; } interface AgentAction { type: string; reasoning?: string; taskCompleted?: boolean; action?: string; timeMs?: number; pageText?: string; pageUrl?: string; instruction?: string; [key: string]: unknown; } interface AgentResult { success: boolean; message: string; actions: AgentAction[]; completed: boolean; metadata?: Record<string, unknown>; usage?: { input_tokens: number; output_tokens: number; inference_time_ms: number; }; } interface AgentExecuteOptions { instruction: string; maxSteps?: number; page?: Page$1 | Page$2 | Page$3 | Page; highlightCursor?: boolean; } type AgentType = "openai" | "anthropic" | "google"; declare const AVAILABLE_CUA_MODELS: readonly ["openai/computer-use-preview", "openai/computer-use-preview-2025-03-11", "anthropic/claude-3-7-sonnet-latest", "anthropic/claude-haiku-4-5-20251001", "anthropic/claude-sonnet-4-20250514", "anthropic/claude-sonnet-4-5-20250929", "google/gemini-2.5-computer-use-preview-10-2025"]; type AvailableCuaModel = (typeof AVAILABLE_CUA_MODELS)[number]; interface AgentExecutionOptions<TOptions extends AgentExecuteOptions = AgentExecuteOptions> { options: TOptions; logger: (message: LogLine) => void; retries?: number; } interface AgentHandlerOptions { modelName: string; clientOptions?: Record<string, unknown>; userProvidedInstructions?: string; experimental?: boolean; } interface ActionExecutionResult { success: boolean; error?: string; data?: unknown; } interface ToolUseItem extends ResponseItem { type: "tool_use"; id: string; name: string; input: Record<string, unknown>; } interface AnthropicMessage { role: string; content: string | Array<AnthropicContentBlock>; } interface AnthropicContentBlock { type: string; [key: string]: unknown; } interface AnthropicTextBlock extends AnthropicContentBlock { type: "text"; text: string; } interface AnthropicToolResult { type: "tool_result"; tool_use_id: string; content: string | 
Array<AnthropicContentBlock>;
}
/** Base response item: a `type` tag, an `id`, and arbitrary additional keys. */
interface ResponseItem {
    type: string;
    id: string;
    [key: string]: unknown;
}
/**
 * A `ResponseItem` tagged "computer_call": carries a call id, an action object
 * (itself a `type` tag plus arbitrary keys) and an optional list of pending
 * safety checks.
 */
interface ComputerCallItem extends ResponseItem {
    type: "computer_call";
    call_id: string;
    action: {
        type: string;
        [key: string]: unknown;
    };
    pending_safety_checks?: Array<{
        id: string;
        code: string;
        message: string;
    }>;
}
/**
 * A `ResponseItem` tagged "function_call".
 * `arguments` is typed as a string; NOTE(review): presumably serialized JSON — confirm.
 */
interface FunctionCallItem extends ResponseItem {
    type: "function_call";
    call_id: string;
    name: string;
    arguments: string;
}
/**
 * Union of three input shapes:
 *  - a role/content message,
 *  - a "computer_call_output" whose `output` is either an "input_image" object
 *    or a string, with optional acknowledged safety checks,
 *  - a "function_call_output" whose `output` is a string.
 */
type ResponseInputItem = {
    role: string;
    content: string;
} | {
    type: "computer_call_output";
    call_id: string;
    output: {
        type: "input_image";
        image_url: string;
        current_url?: string;
        error?: string;
        [key: string]: unknown;
    } | string;
    acknowledged_safety_checks?: Array<{
        id: string;
        code: string;
        message: string;
    }>;
} | {
    type: "function_call_output";
    call_id: string;
    output: string;
};
/** Agent handle exposing `execute`, which accepts an instruction string or an options object. */
interface AgentInstance {
    execute: (instructionOrOptions: string | AgentExecuteOptions) => Promise<AgentResult>;
}
/** Alias of `AgentType`. */
type AgentProviderType = AgentType;
/** A `modelName` (generic over the name's string type) intersected with arbitrary extra keys. */
type AgentModelConfig<TModelName extends string = string> = {
    modelName: TModelName;
} & Record<string, unknown>;
/** Configuration object for an agent; every field is optional. */
type AgentConfig = {
    /**
     * Custom system prompt to provide to the agent. Overrides the default system prompt.
     */
    systemPrompt?: string;
    /**
     * MCP integrations: each entry is either a `Client` object or a string.
     * NOTE(review): string entries are presumably MCP server URLs — confirm.
     */
    integrations?: (Client | string)[];
    /**
     * Tools passed to the agent client
     */
    tools?: ToolSet;
    /**
     * CUA flag for this configuration.
     * NOTE(review): the earlier wording here was "Indicates CUA is disabled for
     * this configuration", which reads as inverted for a boolean named `cua` —
     * confirm the polarity against the implementation.
     */
    cua?: boolean;
    /**
     * The model to use for agent functionality
     */
    model?: string | AgentModelConfig<string>;
    /**
     * The model to use for tool execution (observe/act calls within agent tools).
     * If not specified, inherits from the main model configuration.
* Format: "provider/model" (e.g., "openai/gpt-4o-mini", "google/gemini-2.0-flash-exp") */
    executionModel?: string | AgentModelConfig<string>;
};
/** Base class of the API error types declared directly below; extends the built-in `Error`. */
declare class StagehandAPIError extends Error {
    constructor(message: string);
}
/** API error whose message argument is optional. */
declare class StagehandAPIUnauthorizedError extends StagehandAPIError {
    constructor(message?: string);
}
/** API error constructed from a message. */
declare class StagehandHttpError extends StagehandAPIError {
    constructor(message: string);
}
/** API error constructed from a message. */
declare class StagehandServerError extends StagehandAPIError {
    constructor(message: string);
}
/** API error constructed without arguments. */
declare class StagehandResponseBodyError extends StagehandAPIError {
    constructor();
}
/** API error constructed from a message. */
declare class StagehandResponseParseError extends StagehandAPIError {
    constructor(message: string);
}
/**
 * Optional settings for an act call: model configuration, string variables,
 * a timeout and a target page.
 * NOTE(review): the unit of `timeout` is not stated in this declaration.
 */
interface ActOptions {
    model?: ModelConfiguration;
    variables?: Record<string, string>;
    timeout?: number;
    page?: Page$1 | Page$2 | Page$3 | Page;
}
/** Success flag, message and description together with a list of `Action`s. */
interface ActResult {
    success: boolean;
    message: string;
    actionDescription: string;
    actions: Action[];
}
/** Type inferred from the given Zod object schema `T`. */
type ExtractResult<T extends z$1.AnyZodObject> = z$1.infer<T>;
/** A selector and description, with an optional method name and string arguments. */
interface Action {
    selector: string;
    description: string;
    method?: string;
    arguments?: string[];
}
/**
 * One recorded call: which of the five methods ran, its parameters and result
 * (both `unknown`), and a timestamp string (format not specified here).
 */
interface HistoryEntry {
    method: "act" | "extract" | "observe" | "navigate" | "agent";
    parameters: unknown;
    result: unknown;
    timestamp: string;
}
/**
 * Optional settings for an extract call; same field set as `ObserveOptions`.
 * NOTE(review): the unit of `timeout` is not stated in this declaration.
 */
interface ExtractOptions {
    model?: ModelConfiguration;
    timeout?: number;
    selector?: string;
    page?: Page$1 | Page$2 | Page$3 | Page;
}
/**
 * Zod object schema with a single string field `extraction` ("strip" unknown-key
 * mode). The emitted input/output shapes list the field as optional.
 */
declare const defaultExtractSchema: z$1.ZodObject<{
    extraction: z$1.ZodString;
}, "strip", z$1.ZodTypeAny, {
    extraction?: string;
}, {
    extraction?: string;
}>;
/**
 * Zod object schema with a single string field `pageText` ("strip" unknown-key
 * mode). The emitted input/output shapes list the field as optional.
 */
declare const pageTextSchema: z$1.ZodObject<{
    pageText: z$1.ZodString;
}, "strip", z$1.ZodTypeAny, {
    pageText?: string;
}, {
    pageText?: string;
}>;
/**
 * Optional settings for an observe call; same field set as `ExtractOptions`.
 * NOTE(review): the unit of `timeout` is not stated in this declaration.
 */
interface ObserveOptions {
    model?: ModelConfiguration;
    timeout?: number;
    selector?: string;
    page?: Page$1 | Page$2 | Page$3 | Page;
}
/** String enum whose member values equal their names. */
declare enum V3FunctionName {
    ACT = "ACT",
    EXTRACT = "EXTRACT",
    OBSERVE = "OBSERVE",
    AGENT = "AGENT"
}
/**
 * Numeric counters grouped by prefix (act / extract / observe / agent / total):
 * prompt tokens, completion tokens and inference time in milliseconds.
 */
interface StagehandMetrics {
    actPromptTokens: number;
    actCompletionTokens: number;
    actInferenceTimeMs: number;
    extractPromptTokens: number;
    extractCompletionTokens: number;
    extractInferenceTimeMs: number;
    observePromptTokens: number;
    observeCompletionTokens: number;
    observeInferenceTimeMs: number;
    agentPromptTokens: number;
    agentCompletionTokens: number;
    agentInferenceTimeMs: number;
    totalPromptTokens: number;
    totalCompletionTokens: number;
    totalInferenceTimeMs: number;
}
/** Environment selector used by `V3Options.env`. */
type V3Env = "LOCAL" | "BROWSERBASE";
/**
 * Local launch options for V3 (chrome-launcher + CDP).
 * Matches v2 shape where feasible; unsupported fields are accepted but ignored.
 * Every field is optional.
 */
interface LocalBrowserLaunchOptions {
    args?: string[];
    executablePath?: string;
    userDataDir?: string;
    preserveUserDataDir?: boolean;
    headless?: boolean;
    devtools?: boolean;
    chromiumSandbox?: boolean;
    /** Either a boolean or an explicit list of argument strings. */
    ignoreDefaultArgs?: boolean | string[];
    /** Proxy settings; only `server` is required when the object is supplied. */
    proxy?: {
        server: string;
        bypass?: string;
        username?: string;
        password?: string;
    };
    locale?: string;
    viewport?: {
        width: number;
        height: number;
    };
    deviceScaleFactor?: number;
    hasTouch?: boolean;
    ignoreHTTPSErrors?: boolean;
    cdpUrl?: string;
    /** Connection timeout in milliseconds (per the field name). */
    connectTimeoutMs?: number;
    downloadsPath?: string;
    acceptDownloads?: boolean;
}
/** Constructor options for V3. Only `env` is required. */
interface V3Options {
    env: V3Env;
    apiKey?: string;
    projectId?: string;
    /**
     * Optional: fine-tune Browserbase session creation or resume an existing session.
     * Typed as the SDK's session-create params with `projectId` made optional.
     */
    browserbaseSessionCreateParams?: Omit<Browserbase.Sessions.SessionCreateParams, "projectId"> & {
        projectId?: string;
    };
    browserbaseSessionID?: string;
    localBrowserLaunchOptions?: LocalBrowserLaunchOptions;
    model?: ModelConfiguration;
    llmClient?: LLMClient;
    systemPrompt?: string;
    logInferenceToFile?: boolean;
    experimental?: boolean;
    /** Verbosity level; restricted to 0, 1 or 2. */
    verbose?: 0 | 1 | 2;
    selfHeal?: boolean;
    /** Disable pino logging backend (useful for tests or minimal environments). */
    disablePino?: boolean;
    /** Optional external logger hook for integrating with host apps. */
    logger?: (line: LogLine) => void;
    /** Directory used to persist cached actions for act().
*/
    cacheDir?: string;
    /** NOTE(review): unit not stated in this declaration. */
    domSettleTimeout?: number;
    disableAPI?: boolean;
}
/** Base class of the error types declared below; extends the built-in `Error`. */
declare class StagehandError extends Error {
    constructor(message: string);
}
/** Built from an optional underlying error value of unknown type. */
declare class StagehandDefaultError extends StagehandError {
    constructor(error?: unknown);
}
/** Built from the current environment, the required environment and a feature name. */
declare class StagehandEnvironmentError extends StagehandError {
    constructor(currentEnvironment: string, requiredEnvironment: string, feature: string);
}
/** Built from the name of the missing environment variable and a feature name. */
declare class MissingEnvironmentVariableError extends StagehandError {
    constructor(missingEnvironmentVariable: string, feature: string);
}
/** Built from the list of supported models and an optional feature name. */
declare class UnsupportedModelError extends StagehandError {
    constructor(supportedModels: string[], feature?: string);
}
/** Built from the list of supported providers and an optional feature name. */
declare class UnsupportedModelProviderError extends StagehandError {
    constructor(supportedProviders: string[], feature?: string);
}
/** Built from the offending provider and the list of supported providers. */
declare class UnsupportedAISDKModelProviderError extends StagehandError {
    constructor(provider: string, supportedProviders: string[]);
}
/** Built from the offending model name. */
declare class InvalidAISDKModelFormatError extends StagehandError {
    constructor(modelName: string);
}
/** Built from a property name (`prop`). */
declare class StagehandNotInitializedError extends StagehandError {
    constructor(prop: string);
}
/** Constructed without arguments. */
declare class BrowserbaseSessionNotFoundError extends StagehandError {
    constructor();
}
/** Constructed without arguments. */
declare class CaptchaTimeoutError extends StagehandError {
    constructor();
}
/** Constructed without arguments. */
declare class MissingLLMConfigurationError extends StagehandError {
    constructor();
}
/** Built from a handler type name. */
declare class HandlerNotInitializedError extends StagehandError {
    constructor(handlerType: string);
}
/** Built from a message. */
declare class StagehandInvalidArgumentError extends StagehandError {
    constructor(message: string);
}
/** Built from a list of XPath strings. */
declare class StagehandElementNotFoundError extends StagehandError {
    constructor(xpaths: string[]);
}
/** Built from a message. */
declare class AgentScreenshotProviderError extends StagehandError {
    constructor(message: string);
}
/** Built from a message. */
declare class StagehandMissingArgumentError extends StagehandError {
    constructor(message: string);
}
/** Built from a message. */
declare class CreateChatCompletionResponseError extends StagehandError {
    constructor(message: string);
}
declare class
StagehandEvalError extends StagehandError {
    constructor(message: string);
}
/** Built from a message. */
declare class StagehandDomProcessError extends StagehandError {
    constructor(message: string);
}
/** Built from a message and the selector involved. */
declare class StagehandClickError extends StagehandError {
    constructor(message: string, selector: string);
}
/** Built from a primitive name and a message. */
declare class LLMResponseError extends StagehandError {
    constructor(primitive: string, message: string);
}
/** Built from a frame URL and a message. */
declare class StagehandIframeError extends StagehandError {
    constructor(frameUrl: string, message: string);
}
/** Built from a selector. */
declare class ContentFrameNotFoundError extends StagehandError {
    constructor(selector: string);
}
/** Built from an XPath string. */
declare class XPathResolutionError extends StagehandError {
    constructor(xpath: string);
}
/** Constructed without arguments. */
declare class ExperimentalApiConflictError extends StagehandError {
    constructor();
}
/** Built from a feature name. */
declare class ExperimentalNotConfiguredError extends StagehandError {
    constructor(featureName: string);
}
/**
 * Carries the received value and the formatted Zod issues as readonly fields.
 * Unlike its neighbours, this class extends the built-in `Error` directly
 * rather than `StagehandError`.
 */
declare class ZodSchemaValidationError extends Error {
    readonly received: unknown;
    readonly issues: ReturnType<ZodError["format"]>;
    constructor(received: unknown, issues: ReturnType<ZodError["format"]>);
}
/** Built from a message. */
declare class StagehandInitError extends StagehandError {
    constructor(message: string);
}
/** Exposes the server URL and the original error as readonly fields. */
declare class MCPConnectionError extends StagehandError {
    readonly serverUrl: string;
    readonly originalError: unknown;
    constructor(serverUrl: string, originalError: unknown);
}
/** Built from an optional detail string. */
declare class StagehandShadowRootMissingError extends StagehandError {
    constructor(detail?: string);
}
/** Constructed without arguments. */
declare class StagehandShadowSegmentEmptyError extends StagehandError {
    constructor();
}
/** Built from a segment string and an optional hint. */
declare class StagehandShadowSegmentNotFoundError extends StagehandError {
    constructor(segment: string, hint?: string);
}
/**
 * `LLMClient` subclass tagged `type: "aisdk"`, constructed from a
 * `LanguageModelV2` instance.
 */
declare class AISdkClient extends LLMClient {
    type: "aisdk";
    private model;
    constructor({ model }: {
        model: LanguageModelV2;
    });
    /** Resolves to `T`, which defaults to `ChatCompletion`. */
    createChatCompletion<T = ChatCompletion>({ options, }: CreateChatCompletionOptions): Promise<T>;
}
/** Required inputs: API key, project id and a log sink. */
interface StagehandAPIConstructorParams {
    apiKey: string;
    projectId: string;
    logger: (message: LogLine) =>
void;
}
/**
 * Parameters for starting a session. Model name, model API key, DOM-settle
 * timeout (milliseconds, per the field name) and verbosity are required;
 * the rest are optional.
 */
interface StartSessionParams {
    modelName: string;
    modelApiKey: string;
    domSettleTimeoutMs: number;
    verbose: number;
    systemPrompt?: string;
    /** The SDK's session-create params with `projectId` made optional. */
    browserbaseSessionCreateParams?: Omit<Browserbase.Sessions.SessionCreateParams, "projectId"> & {
        projectId?: string;
    };
    selfHeal?: boolean;
    browserbaseSessionID?: string;
}
/** A session id plus an optional `available` flag. */
interface StartSessionResult {
    sessionId: string;
    available?: boolean;
}
/** Act request: `input` is an instruction string or an `Action`; options and frame id are optional. */
interface APIActParameters {
    input: string | Action;
    options?: ActOptions;
    frameId?: string;
}
/** Extract request: instruction, Zod schema, options and frame id, all optional. */
interface APIExtractParameters {
    instruction?: string;
    schema?: ZodTypeAny;
    options?: ExtractOptions;
    frameId?: string;
}
/** Observe request: instruction, options and frame id, all optional. */
interface APIObserveParameters {
    instruction?: string;
    options?: ObserveOptions;
    frameId?: string;
}
/**
 * Represents a path through a Zod schema from the root object down to a
 * particular field. The `segments` array describes the chain of keys/indices.
 *
 * - **String** segments indicate object property names.
 * - **Number** segments indicate array indices.
 *
 * For example, `["users", 0, "homepage"]` might describe reaching
 * the `homepage` field in `schema.users[0].homepage`.
 */
interface ZodPathSegments {
    /**
     * The ordered list of keys/indices leading from the schema root
     * to the targeted field.
*/ segments: Array<string | number>; } type EvaluateOptions = { /** The question to ask about the task state */ question: string; /** The answer to the question */ answer?: string; /** Whether to take a screenshot of the task state, or array of screenshots to evaluate */ screenshot?: boolean | Buffer[]; /** Custom system prompt for the evaluator */ systemPrompt?: string; /** Delay in milliseconds before taking the screenshot @default 250 */ screenshotDelayMs?: number; /** The agent's reasoning/thought process for completing the task */ agentReasoning?: string; }; type BatchAskOptions = { /** Array of questions with optional answers */ questions: Array<{ question: string; answer?: string; }>; /** Whether to take a screenshot of the task state */ screenshot?: boolean; /** Custom system prompt for the evaluator */ systemPrompt?: string; /** Delay in milliseconds before taking the screenshot @default 1000 */ screenshotDelayMs?: number; }; /** * Result of an evaluation */ interface EvaluationResult { /** * The evaluation result ('YES', 'NO', or 'INVALID' if parsing failed or value was unexpected) */ evaluation: "YES" | "NO" | "INVALID"; /** * The reasoning behind the evaluation */ reasoning: string; } /** * V3Context * * Owns the root CDP connection and wires Target/Page events into Page. * Maintains one Page per top-level target, adopts OOPIF child sessions into the owner Page, * and tracks target→page and (root) frame→target mappings for lookups. * * IMPORTANT: FrameId → session ownership is manag