UNPKG

donobu

Version:

Create browser automations with an LLM agent and replay them as Playwright scripts.

1,023 lines (1,019 loc) • 65.6 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.DonobuFlow = void 0; exports.extractFromPage = extractFromPage; const crypto_1 = require("crypto"); const GptPlatformInsufficientQuotaException_1 = require("../exceptions/GptPlatformInsufficientQuotaException"); const GptPlatformInternalErrorException_1 = require("../exceptions/GptPlatformInternalErrorException"); const UserInterruptException_1 = require("../exceptions/UserInterruptException"); const FlowMetadata_1 = require("../models/FlowMetadata"); const InteractableElement_1 = require("../models/InteractableElement"); const ToolCallResult_1 = require("../models/ToolCallResult"); const AcknowledgeUserInstruction_1 = require("../tools/AcknowledgeUserInstruction"); const HandleBrowserDialogTool_1 = require("../tools/HandleBrowserDialogTool"); const MarkObjectiveCompleteTool_1 = require("../tools/MarkObjectiveCompleteTool"); const MarkObjectiveNotCompletableTool_1 = require("../tools/MarkObjectiveNotCompletableTool"); const JsonSchemaUtils_1 = require("../utils/JsonSchemaUtils"); const JsonUtils_1 = require("../utils/JsonUtils"); const Logger_1 = require("../utils/Logger"); const MiscUtils_1 = require("../utils/MiscUtils"); const PlaywrightUtils_1 = require("../utils/PlaywrightUtils"); /** * Return an object conforming to the given JSON-schema. The object will be * generated considering the given target and tool call history. */ async function extractFromPage(instruction, zodSchema, screenshot, toolCallHistory, gptClient, options) { const finalScreenshot = screenshot; const systemMessage = `You help people navigate applications and return relevant information. For our purposes, we call this overall process running a "Donobu Flow", with you being named Donobu. The current date in yyyy-MM-dd format is ${new Date().toISOString().split('T')[0]} You will be given a history of what you have done so far, a screenshot of the current webpage, and a JSON schema. Return an appropriatly formatted JSON conforming to the given schema. Additionally, the following is the original instruction from the user, of which, MAY or MAY NOT be relevant for generating the appropriate JSON... Original instruction: ${instruction}`; const formattedToolCallHistory = JSON.stringify(toolCallHistory .map((tc) => { return JsonUtils_1.JsonUtils.objectToJson(tc); }) .map((tc) => { delete tc.id; delete tc.postCallImageId; tc.outcome.result = tc.outcome.forLlm; delete tc.outcome.forLlm; delete tc.outcome.metadata; return tc; }), null, 2); const userMessage = `The data gathered for the current Donobu Flow via your tool calls is: ${formattedToolCallHistory} `; const userMessageScreenshot = finalScreenshot ? { type: 'jpeg', bytes: finalScreenshot, } : null; const userMessageItem = { type: 'text', text: userMessage, }; // Set up timeout to prevent indefinite hangs const timeoutMillis = options?.timeout ?? 60000; const abortController = new AbortController(); const timeoutId = setTimeout(() => { abortController.abort(`Extract operation timed out after ${timeoutMillis} milliseconds`); }, timeoutMillis); try { const structuredOutputMessage = await gptClient.getStructuredOutput([ { type: 'system', text: systemMessage, }, { type: 'user', items: userMessageScreenshot ? [userMessageScreenshot, userMessageItem] : [userMessageItem], }, ], zodSchema, { signal: abortController.signal }); return structuredOutputMessage; } finally { clearTimeout(timeoutId); } } /** * This is the main business-logic class of Donobu. This class goes through a * flow via its `run` method. */ class DonobuFlow { constructor(flowsManager, envData, persistence, gptClient, toolManager, interactionVisualizer, proposedToolCalls, invokedToolCalls, gptMessages, targetInspector, metadata, controlPanel) { this.flowsManager = flowsManager; this.envData = envData; this.persistence = persistence; this.gptClient = gptClient; this.toolManager = toolManager; this.interactionVisualizer = interactionVisualizer; this.proposedToolCalls = proposedToolCalls; this.invokedToolCalls = invokedToolCalls; this.gptMessages = gptMessages; this.targetInspector = targetInspector; this.metadata = metadata; this.controlPanel = controlPanel; this.inProgressToolCall = null; this.aiQueries = []; } /** * Drives the entire Donobu flow state-machine until it reaches a * terminal state. * * The method loops indefinitely, delegating to a handler that matches the * current {@link metadata.state}. Each handler may mutate state, enqueue * tool calls, and/or persist data. After the handler returns, * {@link transitionState} decides the next state (taking into account user * signals, control-panel input, or tool-call outcomes). * * High-level sequence per iteration: * 1. Refresh the control-panel UI. * 2. Invoke the appropriate `on*` handler for the active state. * 3. If the flow is *not* yet in a terminal state, compute and persist the * next state via `transitionState`; otherwise execute {@link onComplete} * and exit the loop. * * Robustness features: * - **Error handling:** Specific exceptions such as * {@link PageClosedException} (page vanished) and * {@link GptPlatformInternalErrorException} (persistent LLM failure) * are converted into a `FAILED` state; any other unhandled error bubbles * up after setting the flow result accordingly. * * Side-effects (per successful iteration): * - Persists tool calls, screenshots, token counts, and mutated * {@link FlowMetadata} via the injected {@link FlowsPersistence}. * - May write Playwright storage state for later sessions. * - Optionally POSTs flow-completion callbacks (`callbackUrl`). * * @returns A promise resolving to the object stored in * {@link metadata.result}, or `null` when the flow ended without an * explicit result. */ async run() { while (true) { try { this.controlPanel.update({ state: this.metadata.state }); switch (this.metadata.state) { case 'UNSTARTED': await this.onUnstarted(); break; case 'INITIALIZING': await this.onInitializing(); break; case 'RUNNING_ACTION': await this.onRunningAction(); break; case 'QUERYING_LLM_FOR_NEXT_ACTION': await this.onQueryingLlmForNextAction(); break; case 'WAITING_ON_USER_FOR_NEXT_ACTION': await this.onWaitingForUserForNextAction(); break; case 'PAUSED': await this.onPaused(); break; case 'RESUMING': await this.onResuming(); break; case 'FAILED': await this.onFailed(); break; case 'SUCCESS': await this.onSuccess(); break; } if ((0, FlowMetadata_1.isComplete)(this.metadata.state)) { await this.onComplete(); break; } else { const userAction = this.controlPanel.popLatestUserAction(); if (userAction) { throw new UserInterruptException_1.UserInterruptException(userAction); } await this.transitionState(); } this.targetInspector.checkTargetAliveOrThrow(); } catch (error) { if (this.targetInspector.isTargetClosedError(error)) { await this.onTargetClosed(); } else if (error instanceof GptPlatformInsufficientQuotaException_1.GptPlatformInsufficientQuotaException) { await this.onInsufficientQuota(error); } else if (error instanceof GptPlatformInternalErrorException_1.GptPlatformInternalErrorException) { await this.onPersistentGptFailure(error); } else if (error instanceof UserInterruptException_1.UserInterruptException) { await this.onUserInterruption(error); } else { await this.onUnexpectedException(error); } } } return this.metadata.result; } /** * Delegates to the inspector to attempt recovery after the target is * closed. If recovery fails, the flow is marked as failed. */ async onTargetClosed() { const result = await this.targetInspector.handleTargetClosed(); if (!result.recovered) { // Persist browser state BEFORE flipping the in-memory `state` to // a terminal value. FlowCatalog.getFlowById serves the *live* // FlowMetadata object for LOCAL deployments, so the next frontend // poll observes terminal state the moment we mutate `state`. // If we do that before the (potentially network-bound) session // upload, an eager browser-state fetch from the frontend (e.g. // FlowDeveloperTools auto-loads on terminal) races the upload and // 404s. Same rationale as the ordering in transitionState. Logger_1.appLogger.error(result.reason); await this.persistTerminalSessionStateIfNeeded(); this.metadata.result = { failed: result.reason }; this.metadata.state = 'FAILED'; await this.persistence.setFlowMetadata(this.metadata); } } /** * This method is called if there are persistent GPT platform failures (there * are internal retries). This method will mark the flow as a failure. */ async onPersistentGptFailure(error) { Logger_1.appLogger.error(`Stopped flow due to the ${this.gptClient?.config.type} GPT platform throwing an internal error!`, error); await this.persistTerminalSessionStateIfNeeded(); this.metadata.result = { failed: `Stopped flow due to the ${this.gptClient?.config.type} GPT platform throwing an internal error!`, context: error.message, }; this.metadata.state = 'FAILED'; await this.persistence.setFlowMetadata(this.metadata); } /** * This method is called when the AI platform reports that the account's * usage quota or credits have been exhausted (HTTP 402). */ async onInsufficientQuota(error) { const platform = error.gptPlatform; const isDonobu = platform === 'DONOBU'; const failedMessage = isDonobu ? 'Your Donobu AI credits have been exhausted. Please add more credits to your account to continue running flows.' : `Your ${platform} API quota has been exhausted. Please check your account's billing and usage limits; this may happen if there is a lack of funds in the account`; Logger_1.appLogger.error(failedMessage, error); await this.persistTerminalSessionStateIfNeeded(); this.metadata.result = { failed: failedMessage }; this.metadata.state = 'FAILED'; await this.persistence.setFlowMetadata(this.metadata); } /** * This method is called when a user interrupts the flow. * It handles the user action and sets the appropriate flow state. * * Note that this *bypasses* the normal state transition logic! */ async onUserInterruption(error) { // Do nothing if we are already ending the flow. if ((0, FlowMetadata_1.isComplete)(this.metadata.state)) { return; } const { userAction } = error; Logger_1.appLogger.info(`Handling user interruption: ${userAction.type}`); // Set the next state based on user action switch (userAction.type) { case 'PAUSE': this.metadata.state = 'PAUSED'; await this.targetInspector.hideInteractionCursor(); break; case 'RESUME': // Handle user instruction if provided const userInstruction = userAction.userInstruction?.trim(); if (userInstruction) { const inst = this.gptClient ? userInstruction : 'Unable to acknowledge user instruction due to no GPT client being available!'; Logger_1.appLogger.debug(`Adding user interrupt to flow history: ${inst}`); // Create a user message and add it to the history this.gptMessages.push({ type: 'user', items: [ { type: 'text', text: `${DonobuFlow.USER_INTERRUPT_MARKER}: ${inst}`, }, ], }); this.proposedToolCalls.length = 0; const toolCall = { id: MiscUtils_1.MiscUtils.createAdHocToolCallId(), toolName: AcknowledgeUserInstruction_1.AcknowledgeUserInstructionTool.NAME, parameters: { userInstruction: inst, }, outcome: { isSuccessful: true, forLlm: inst, metadata: null, }, postCallImageId: null, page: this.targetInspector.getCurrentLocation(), startedAt: new Date().getTime(), completedAt: new Date().getTime(), }; this.invokedToolCalls.push(toolCall); await this.persistence.setToolCall(this.metadata.id, toolCall); // Since we received a user instruction, we need to let the LLM // decide what to do with it. if (this.gptClient) { this.metadata.runMode = 'AUTONOMOUS'; } } if (this.metadata.runMode === 'AUTONOMOUS') { await this.targetInspector.showInteractionCursor(); } this.metadata.state = 'RESUMING'; break; case 'END': // Add completion tool call this.proposedToolCalls.length = 0; this.proposedToolCalls.push({ name: MarkObjectiveCompleteTool_1.MarkObjectiveCompleteTool.NAME, parameters: { rationale: 'User ended flow.', details: 'User ended flow.', }, }); this.metadata.state = 'RUNNING_ACTION'; break; } await this.persistence.setFlowMetadata(this.metadata); } /** * This method is called if there is an unhandled unexpected exception. This * method will mark the flow as a failure. */ async onUnexpectedException(error) { Logger_1.appLogger.error('Stopped flow due to exception!', error); await this.persistTerminalSessionStateIfNeeded(); this.metadata.result = { failed: 'Internal error 🙈', }; this.metadata.state = 'FAILED'; await this.persistence.setFlowMetadata(this.metadata); } /** * This method is called when a flow is complete (i.e. when {@link DonobuFlow.run} should return). * * Browser session state and the terminal-state metadata write are * committed by whichever code path produced the terminal state * (transitionState for tool-driven completion; onTargetClosed / * onPersistentGptFailure / onInsufficientQuota / onUnexpectedException * for failure paths) — by the time we reach onComplete those have * already happened. This method just runs the post-completion side * effects. */ async onComplete() { DonobuFlow.invokeFlowFinishedCallback(this.metadata.callbackUrl, this.metadata.id); this.controlPanel.close(); } /** * Persists the current browser session state if the flow's config has * `persistState` enabled. Must be called BEFORE the in-memory `state` * is mutated to a terminal value at every site that produces a * terminal state — otherwise FlowCatalog.getFlowById can read the * live FlowMetadata object (LOCAL deployments) and a frontend that * observes the terminal state will race the (potentially network- * bound) upload here, getting a 404 from a subsequent browser-state * fetch. * * The browser context typically survives all-pages-closed (the read * goes against the context, not a specific page), so this is safe to * call from failure handlers like onTargetClosed. If the read does * fail, persistSessionState catches and logs internally — it doesn't * propagate. */ async persistTerminalSessionStateIfNeeded() { if (this.metadata.web?.browser?.persistState) { await this.targetInspector.persistSessionState(this.persistence, this.metadata.id); } } /** * Attempt to POST a JSON body containing given flow ID to the given * ${@link callbackUrl} if the URL is non-null. Note that there is no retying * if the POST fails for any reason; this is a best-effort 1-shot try. */ static invokeFlowFinishedCallback(callbackUrl, flowId) { if (!callbackUrl) { return; } try { fetch(callbackUrl, { method: 'POST', headers: { 'Content-Type': 'application/json', }, body: JSON.stringify({ id: flowId, }), }).catch((error) => { Logger_1.appLogger.error(`Failed to invoke flow completion callback at ${callbackUrl}`, error); }); } catch (error) { Logger_1.appLogger.error(`Failed to invoke flow completion callback at ${callbackUrl}`, error); } } async onDialog(dialog) { // Since this function is run as an async callback, it can never leak an exception // or else it will crash the whole program, so we wrap everything in a giant try/catch // and just log on error. try { const startedAt = new Date().getTime(); switch (dialog.type()) { case 'confirm': case 'prompt': { const maybeHandleBrowserDialogTool = this.proposedToolCalls[0]; if (maybeHandleBrowserDialogTool?.name === HandleBrowserDialogTool_1.HandleBrowserDialogTool.NAME) { // Handle rerun case this.proposedToolCalls.shift(); const paramsForRerun = maybeHandleBrowserDialogTool.parameters; const textParam = JsonUtils_1.JsonUtils.objectToJson(paramsForRerun).text; if (dialog.type() === 'confirm') { if (textParam === 'true') { await dialog.accept(); } else { await dialog.dismiss(); } } else if (textParam === null || textParam === undefined) { await dialog.dismiss(); } else { await dialog.accept(textParam); } const postCallImage = await PlaywrightUtils_1.PlaywrightUtils.takeViewportScreenshot(dialog.page()); const postCallImageId = await this.persistence.saveScreenShot(this.metadata.id, postCallImage); const completedAt = new Date().getTime(); const toolCall = { id: MiscUtils_1.MiscUtils.createAdHocToolCallId(), toolName: HandleBrowserDialogTool_1.HandleBrowserDialogTool.NAME, parameters: JsonUtils_1.JsonUtils.objectToJson(paramsForRerun), outcome: ToolCallResult_1.ToolCallResult.successful(), postCallImageId: postCallImageId, page: dialog.page().url(), startedAt: startedAt, completedAt: completedAt, }; this.invokedToolCalls.push(toolCall); await this.persistence.setToolCall(this.metadata.id, toolCall); } else if (this.metadata.runMode === 'AUTONOMOUS') { try { this.metadata.state = 'PAUSED'; // Ask LLM what to do with only one tool choice const gptMessagesCopy = DonobuFlow.createOptimizedHistoryForGptCall(this.gptMessages); const prompt = `IMPORTANT: Now, a webpage dialog has popped up on ${dialog.page()?.url()} and must be handled! Type: "${dialog.type()}" Message: ${dialog.message()}`; const userMessage = { type: 'user', items: [{ type: 'text', text: prompt }], }; let toolCallResult; let parameters = {}; try { const proposedToolCallsMessage = await this.queryGptWithRetry([...gptMessagesCopy, userMessage], [new HandleBrowserDialogTool_1.HandleBrowserDialogTool()]); Logger_1.appLogger.debug('LLM response for handling browser pop-up dialog:', JsonUtils_1.JsonUtils.objectToJson(proposedToolCallsMessage)); MiscUtils_1.MiscUtils.updateTokenCounts(proposedToolCallsMessage, this.metadata); const rawToolCallProposal = proposedToolCallsMessage.proposedToolCalls[0]; // WARNING: Dismissing/accepting the dialog MUST happen before we meaningfully // interact with the webpage, otherwise, Playwright will freeze! if (rawToolCallProposal.parameters) { const confirmationDecision = rawToolCallProposal.parameters; if (dialog.type() === 'confirm') { if (confirmationDecision.text === 'true') { await dialog.accept(); } else { await dialog.dismiss(); } } else if (!confirmationDecision.text) { await dialog.dismiss(); } else { await dialog.accept(confirmationDecision.text); } toolCallResult = ToolCallResult_1.ToolCallResult.successful(); parameters = confirmationDecision; } else { await dialog.dismiss(); toolCallResult = { isSuccessful: false, forLlm: `Unexpected response (${JSON.stringify(rawToolCallProposal)}) for handling dialog! Defaulted to dismissing the dialog!`, metadata: null, }; } } catch (error) { Logger_1.appLogger.error('Failed to handle browser pop-up dialog due to exception! Dismissing...', error); await dialog.dismiss(); toolCallResult = { isSuccessful: false, forLlm: 'Unexpected exception when handling dialog! Defaulted to dismissing the dialog!', metadata: null, }; } const postCallImage = await PlaywrightUtils_1.PlaywrightUtils.takeViewportScreenshot(dialog.page()); const postCallImageId = await this.persistence.saveScreenShot(this.metadata.id, postCallImage); const completedAt = new Date().getTime(); const toolCall = { id: MiscUtils_1.MiscUtils.createAdHocToolCallId(), toolName: HandleBrowserDialogTool_1.HandleBrowserDialogTool.NAME, parameters: parameters, outcome: toolCallResult, postCallImageId: postCallImageId, page: dialog.page().url(), startedAt: startedAt, completedAt: completedAt, }; this.invokedToolCalls.push(toolCall); await this.persistence.setToolCall(this.metadata.id, toolCall); } finally { this.metadata.nextState = 'QUERYING_LLM_FOR_NEXT_ACTION'; } } else { // Handle instruct mode - user manually handles dialog const dialogResponse = { current: '' }; try { await dialog.page().waitForEvent('console', { predicate: (message) => { if (message.text().startsWith('DONOBU_DIALOG_RESPONSE')) { if (message.args().length <= 1) { Logger_1.appLogger.error(`Missing args for DONOBU_DIALOG_RESPONSE for dialog: ${dialog.message()}`); } else { // Get the second argument which contains the response dialogResponse.current = message.args()[1].toString(); } return true; } return false; }, }); const postCallImage = await PlaywrightUtils_1.PlaywrightUtils.takeViewportScreenshot(dialog.page()); const postCallImageId = await this.persistence.saveScreenShot(this.metadata.id, postCallImage); const completedAt = new Date().getTime(); const toolCall = { id: MiscUtils_1.MiscUtils.createAdHocToolCallId(), toolName: HandleBrowserDialogTool_1.HandleBrowserDialogTool.NAME, parameters: { rationale: 'User action', text: dialogResponse.current, }, outcome: ToolCallResult_1.ToolCallResult.successful(), postCallImageId: postCallImageId, page: dialog.page().url(), startedAt: startedAt, completedAt: completedAt, }; this.invokedToolCalls.push(toolCall); await this.persistence.setToolCall(this.metadata.id, toolCall); } catch (error) { // Handle any timeout or other errors Logger_1.appLogger.error('Error waiting for dialog response:', error); await dialog.dismiss(); } } break; } default: { Logger_1.appLogger.info(`Automatically dismissing dialog of type ${dialog.type()} with contents: ${dialog.message()}`); await dialog.dismiss(); } } } catch (error) { Logger_1.appLogger.error('Unexpected exception while handling dialog!', error); } } /** * Transitions the flow to its next state. After this method completes, the * `this.metadata.state` will have been updated and the * `this.metadata.nextState` will have been cleared. */ async transitionState() { let nextState = this.metadata.nextState; // If there is no focused page and we would be transitioning to a state // that assumes one, then fail the flow. if (!this.targetInspector.target.current) { switch (nextState) { case 'QUERYING_LLM_FOR_NEXT_ACTION': case 'WAITING_ON_USER_FOR_NEXT_ACTION': case 'PAUSED': case 'RESUMING': case 'RUNNING_ACTION': this.proposedToolCalls.length = 0; this.proposedToolCalls.push({ name: MarkObjectiveNotCompletableTool_1.MarkObjectiveNotCompletableTool.NAME, parameters: { rationale: 'Browser window closed.', }, }); break; } } if (!nextState) { // This means that neither the control panel, nor tool call, or anything // is pushing for a particular next state, so we just do a boring if/else // rules check. if (this.proposedToolCalls.length > 0) { // We have tool calls that need to be run, so lets just do that. nextState = 'RUNNING_ACTION'; } else { // We have no tool calls to run, so now things are based on the current // run mode of the flow... switch (this.metadata.runMode) { case 'AUTONOMOUS': // The LLM is driving the flow, so ask the LLM what to do next. nextState = 'QUERYING_LLM_FOR_NEXT_ACTION'; break; case 'INSTRUCT': // A user is driving the flow, so wait for them to tell us what to // do next. nextState = 'WAITING_ON_USER_FOR_NEXT_ACTION'; break; case 'DETERMINISTIC': // Nobody is driving the flow, and we have run out of things to do, // so end the flow. const lastToolCall = this.invokedToolCalls[this.invokedToolCalls.length - 1]; nextState = lastToolCall?.outcome.isSuccessful ? 'SUCCESS' : 'FAILED'; break; default: throw new Error(`Unknown Donobu flow run mode: ${this.metadata.runMode}`); } } } // Check if the next state would complete the flow so that we can set // the result object before the final state is set. If we did not do // this, then someone polling for a flow's state may see the flow // finished but still see a null result. // // The same rationale applies to the browser session state — see // persistTerminalSessionStateIfNeeded for the full ordering note. if ((0, FlowMetadata_1.isComplete)(nextState)) { this.metadata.result = await this.createResultJson(nextState); await this.persistTerminalSessionStateIfNeeded(); } const lastState = this.metadata.state; this.metadata.state = nextState; this.metadata.nextState = null; await this.persistence.setFlowMetadata(this.metadata); if (lastState !== nextState) { Logger_1.appLogger.info(`Transitioned flow state from ${lastState} to ${nextState}`); } } /** * Returns an object formatted according to the following priority: * * <ol> * <li>If the `result` for this flow is non-null, then it is returned as-is. * <li>If the given next state of the flow is `SUCCESS`, the * `resultJsonSchema` is non-null, and the `gptClient` is non-null, * then an attempt is made to take the context of the entire run and * conform it to this schema. If there is an error when attempting to * map the data of the current flow to the `resultJsonSchema`, then an * object with error details is returned. * <li>Otherwise, the `metadata` of the last tool call is returned. * </ol> */ async createResultJson(nextState) { if (this.metadata.result) { return this.metadata.result; } if (nextState === 'SUCCESS' && this.metadata.resultJsonSchema && this.gptClient) { try { const screenshot = this.targetInspector.connected ? await this.targetInspector.captureScreenshot() : null; const structuredOutputMessage = await extractFromPage(this.metadata.overallObjective ?? 'Generate an object conforming to the given JSON-schema', (0, JsonSchemaUtils_1.jsonSchemaToZod)(this.metadata.resultJsonSchema), screenshot, this.invokedToolCalls, this.gptClient); MiscUtils_1.MiscUtils.updateTokenCounts(structuredOutputMessage, this.metadata); return structuredOutputMessage.output; } catch (error) { return { exception: typeof error, message: error.message, note: 'Unexpected exception while attempting to create the result object for this flow.', }; } } const lastToolCall = this.invokedToolCalls[this.invokedToolCalls.length - 1]; return lastToolCall?.outcome.metadata ?? lastToolCall?.outcome ?? null; } /** * All this method does is set the next state to {@link State.INITIALIZING}. */ async onUnstarted() { this.metadata.nextState = 'INITIALIZING'; } /** * This method sets up the page initialization scripts and bindings, and * initializes the GPT message history. */ async onInitializing() { this.metadata.startedAt = new Date().getTime(); this.gptMessages.push(DonobuFlow.createSystemMessageForOverallObjective(this.metadata.envVars, this.metadata.overallObjective, this.targetInspector)); if (this.proposedToolCalls.length > 0) { this.gptMessages.push({ type: 'user', items: [ { type: 'text', text: 'Pursue the objective in the system prompt.' }, ], }); } await this.targetInspector.initialize({ metadata: this.metadata, dialogHandler: (dialog) => this.onDialog(dialog), interactionTrackingHost: this, }); } async onRunningAction() { const proposedToolCall = this.proposedToolCalls.shift(); if (!proposedToolCall) { return; } if (this.metadata.maxToolCalls !== null && this.invokedToolCalls.length >= this.metadata.maxToolCalls) { this.metadata.result = { failed: `Stopped flow due to hitting the maximum of ${this.metadata.maxToolCalls} actions.`, }; this.metadata.nextState = 'FAILED'; return; } const isUserDirected = !proposedToolCall.toolCallId; const finalProposedToolCall = isUserDirected ? { ...proposedToolCall, toolCallId: MiscUtils_1.MiscUtils.createAdHocToolCallId(), } : proposedToolCall; // Poll for user-directed state changes from the control panel while the // tool is running. let poller = null; let userInterruptException = null; const startControlPanelStatePolling = () => { if (poller) { clearInterval(poller); } poller = setInterval(() => { const userAction = this.controlPanel.popLatestUserAction(); if (!userAction) { return; } userInterruptException = new UserInterruptException_1.UserInterruptException(userAction); clearInterval(poller); }, 250); }; // Start polling before invoking the tool. startControlPanelStatePolling(); const toolCallContext = { flowsManager: this.flowsManager, envData: this.envData, targetInspector: this.targetInspector, controlPanel: this.controlPanel, persistence: this.persistence, gptClient: this.gptClient, interactionVisualizer: this.interactionVisualizer, proposedToolCalls: this.proposedToolCalls, invokedToolCalls: this.invokedToolCalls, metadata: this.metadata, toolCallId: finalProposedToolCall.toolCallId, }; let toolCall; this.inProgressToolCall = { id: finalProposedToolCall.toolCallId, toolName: finalProposedToolCall.name, parameters: finalProposedToolCall.parameters ?? {}, outcome: null, postCallImageId: null, page: '', startedAt: Date.now(), completedAt: null, }; try { toolCall = await this.toolManager.invokeTool(toolCallContext, finalProposedToolCall.name, finalProposedToolCall.parameters, !isUserDirected); } finally { this.inProgressToolCall = null; // The tool has finished. Stop the poller. if (poller) { clearInterval(poller); } } // The `invokeTool` call does not throw PageClosedException since we would // then lose the `toolCall` data, so instead, it packs it in the `toolCall` // result metadata. const exceptionName = toolCall.outcome.metadata?.exception; if (exceptionName === 'PageClosedException' || exceptionName === 'DeviceClosedException') { await this.onTargetClosed(); } if (isUserDirected) { this.updateGptMessagesWithUserProposedToolCall(finalProposedToolCall); } this.gptMessages.push({ type: 'tool_call_result', toolName: toolCall.toolName, data: toolCall.outcome.forLlm, toolCallId: toolCall.id, }); // If we are running deterministically (i.e. a pre-canned series of tool calls), // then bail early if one of the calls fails, since there is no way for either // an AI or a human to help course-correct. if (!toolCall.outcome.isSuccessful && this.metadata.runMode === 'DETERMINISTIC') { // Removing the remaining tool calls causes the whole flow to fail and // sets the result to be the status of the last processed call. this.proposedToolCalls.length = 0; } if (userInterruptException) { throw userInterruptException; } } async onQueryingLlmForNextAction() { if (!this.gptClient) { throw new Error('Cannot query the GPT with the client set to null.'); } const proposedToolCallsMessage = await this.queryGptForProposedToolCalls(); this.proposedToolCalls.push(...proposedToolCallsMessage.proposedToolCalls); this.gptMessages.push(proposedToolCallsMessage); } async onWaitingForUserForNextAction() { try { if (this.targetInspector.connected) { await DonobuFlow.sleep(100); } } catch (error) { if (!this.targetInspector.isTargetClosedError(error)) { throw error; } } } async onPaused() { try { if (this.targetInspector.connected) { await DonobuFlow.sleep(100); } } catch (error) { if (!this.targetInspector.isTargetClosedError(error)) { throw error; } } // Continue to pause assuming we have not been told to do otherwise if (this.metadata.nextState === null) { this.metadata.nextState = 'PAUSED'; } } async onResuming() { // This state is just so that the flow can get out of a pause loop. this.metadata.nextState = null; } async onFailed() { this.metadata.completedAt = new Date().getTime(); this.metadata.nextState = null; await this.persistence.setFlowMetadata(this.metadata); Logger_1.appLogger.error(`Completed flow with state: ${this.metadata.state}`); } async onSuccess() { this.metadata.completedAt = new Date().getTime(); this.metadata.nextState = null; await this.persistence.setFlowMetadata(this.metadata); Logger_1.appLogger.info(`Completed flow with state: ${this.metadata.state}`); } updateGptMessagesWithUserProposedToolCall(proposedToolCall) { let hasUpdated = false; // Iterate through messages in reverse to find the right place to update/insert for (let i = this.gptMessages.length - 1; i >= 0 && !hasUpdated; i--) { const msg = this.gptMessages[i]; if (msg.type === 'tool_call_result') { // This is fine, it just means we had multiple tool calls in a row continue; } if (msg.type === 'proposed_tool_calls') { // Update existing ProposedToolCallsMessage const promptTokensUsed = msg.promptTokensUsed; const completionTokensUsed = msg.completionTokensUsed; const updatedProposedToolCalls = [ ...msg.proposedToolCalls, proposedToolCall, ]; const updatedProposedToolCallsMessage = { type: 'proposed_tool_calls', proposedToolCalls: updatedProposedToolCalls, promptTokensUsed: promptTokensUsed, completionTokensUsed: completionTokensUsed, }; // REPLACE the existing proposed tool call message this.gptMessages[i] = updatedProposedToolCallsMessage; hasUpdated = true; } else if (msg.type === 'user') { // Insert a ProposedToolCallsMessage after the user message const proposedToolCallsMessage = { type: 'proposed_tool_calls', proposedToolCalls: [proposedToolCall], promptTokensUsed: 0, completionTokensUsed: 0, }; // INSERT the new message after the user message this.gptMessages.splice(i + 1, 0, proposedToolCallsMessage); hasUpdated = true; } else if (msg.type === 'system') { // Insert both UserMessage and ProposedToolCallsMessage after system message const userMessage = { type: 'user', items: [ { type: 'text', text: `Run the ${proposedToolCall.name} tool.` }, ], }; const proposedToolCallsMessage = { type: 'proposed_tool_calls', proposedToolCalls: [proposedToolCall], promptTokensUsed: 0, completionTokensUsed: 0, }; // INSERT the new messages after the system message this.gptMessages.splice(i + 1, 0, userMessage); this.gptMessages.splice(i + 2, 0, proposedToolCallsMessage); hasUpdated = true; } else if (msg.type === 'assistant' || msg.type === 'structured_output') { // This is entirely unexpected, as we do not use the GPT like this when // running a flow normally. throw new Error(`Unexpected message type in the GPT message history: ${msg.constructor.name}`); } } if (!hasUpdated) { // This should be impossible, as it would mean that the GPT message // history only contains tool call results or is empty entirely, both of // which should never happen as we always include a system prompt. throw new Error(`Malformed GPT message history!`); } } async queryGptForProposedToolCalls() { this.targetInspector.checkConnectedOrThrow(); // Initialise the AI query record immediately so the error handler always // has a record to update — no conditional check needed. let aiQuery = { id: (0, crypto_1.randomUUID)(), cleanScreenshotId: null, annotatedScreenshotId: null, interactableElements: null, error: null, startedAt: Date.now(), completedAt: null, }; this.aiQueries.push(aiQuery); try { // Discover and mark all interactable elements on the current screen/page. await this.targetInspector.attributeInteractableElements(); // Capture clean and annotated screenshots. Each inspector implementation // handles the platform-specific details (DOM injection vs server-side compositing). const screenshotBytes = await this.targetInspector.takeCleanScreenshot(); const cleanScreenshotId = await this.persistence.saveScreenShot(this.metadata.id, screenshotBytes); await this.targetInspector.annotateInteractableElements(); const annotatedScreenShotBytes = await this.targetInspector.takeAnnotatedScreenshot(); await this.targetInspector.removeAnnotations(); const annotatedScreenshotId = await this.persistence.saveScreenShot(this.metadata.id, annotatedScreenShotBytes); const interactableElements = await this.targetInspector.getAttributedInteractableElements(); // Fill in the remaining fields and persist so the frontend can display // the record immediately. aiQuery = { ...aiQuery, cleanScreenshotId, annotatedScreenshotId, interactableElements, }; this.aiQueries[this.aiQueries.length - 1] = aiQuery; await this.persistence .setAiQuery(this.metadata.id, aiQuery) .catch((err) => Logger_1.appLogger.error('Failed to persist AI query record', err)); const mainMessage = DonobuFlow.createMainUserMessage(this.targetInspector, interactableElements); // Give the LLM both the pre and post annotated screenshots. It can // use the clean screenshot to decide what it wants to do, then map it to // the appropriate annotated element on the annotated screenshot. const userMessage = { type: 'user', items: [ { type: 'jpeg', bytes: screenshotBytes }, { type: 'jpeg', bytes: annotatedScreenShotBytes }, mainMessage, ], }; this.gptMessages.push(userMessage); const messagesToSendToGpt = DonobuFlow.createOptimizedHistoryForGptCall(this.gptM