UNPKG

donobu

Version:

Create browser automations with an LLM agent and replay them as Playwright scripts.

253 lines 11.6 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.PageInteractionTracker = exports.InteractionEventSchema = void 0; const v4_1 = require("zod/v4"); const donobu_namespace_1 = require("../browser-side-scripts/donobu-namespace"); const page_interactions_tracker_1 = require("../browser-side-scripts/page-interactions-tracker"); const smart_selector_generator_1 = require("../browser-side-scripts/smart-selector-generator"); const ClickTool_1 = require("../tools/ClickTool"); const InputTextTool_1 = require("../tools/InputTextTool"); const PressKeyTool_1 = require("../tools/PressKeyTool"); const ReplayableInteraction_1 = require("../tools/ReplayableInteraction"); const Logger_1 = require("../utils/Logger"); const MiscUtils_1 = require("../utils/MiscUtils"); const PlaywrightUtils_1 = require("../utils/PlaywrightUtils"); /** * Zod schema for validating InteractionEvent objects. */ exports.InteractionEventSchema = v4_1.z.object({ // The type of interaction (e.g. "click", "keydown", etc.) type: v4_1.z.string(), // Optional key pressed during the interaction (e.g. "Enter") key: v4_1.z.string().optional(), // Array of CSS/Xpath selectors representing the target element involved selectors: v4_1.z.array(v4_1.z.string()), timestamp: v4_1.z.number(), x: v4_1.z.number().optional(), y: v4_1.z.number().optional(), dragDistance: v4_1.z.number().optional(), keyCode: v4_1.z.int().optional(), altKey: v4_1.z.boolean().optional(), ctrlKey: v4_1.z.boolean().optional(), metaKey: v4_1.z.boolean().optional(), shiftKey: v4_1.z.boolean().optional(), }); /** * Tracks direct user actions that have occurred in the web browser. * This differs from normal Donobu flows which use GPT to analyze pages and propose actions. * This tracker records direct user actions for reliable replay and GPT visibility. */ class PageInteractionTracker { static async register(host, browserContext) { const instance = new PageInteractionTracker(host); try { await browserContext.exposeBinding(instance.name(), instance.call.bind(instance)); await browserContext.addInitScript(donobu_namespace_1.installDonobuNamespace); await browserContext.addInitScript(smart_selector_generator_1.installSmartSelectorGenerator); browserContext.on('page', async (page) => { page.on('framenavigated', async (frame) => { try { await frame.evaluate(page_interactions_tracker_1.installPageInteractionsTracker); } catch (error) { if (!PlaywrightUtils_1.PlaywrightUtils.isPageClosedError(error)) { Logger_1.appLogger.error('Error when evaluating installPageInteractionsTracker', error); } } }); }); } catch (_error) { // This means the binding has already been registered. } return instance; } constructor(host) { this.host = host; } name() { return PageInteractionTracker.NAME; } /** * Handles tracking of page interactions. * * Assumes: * - args is a single-element array with first element being JSON convertible to InteractionEvent */ async call(source, ...args) { const flowState = this.host.metadata.state; if (flowState === 'PAUSED' || flowState === 'WAITING_ON_USER_FOR_NEXT_ACTION') { Logger_1.appLogger.debug(`Event: ${JSON.stringify(args[0])}`); const event = exports.InteractionEventSchema.parse(args[0]); await this.handleSyntheticToolCall(source, event); } } async handleSyntheticToolCall(source, event) { switch (event.type) { case 'click': { await this.handleClick(source, event); break; } case 'keydown': { await this.handleKeyDown(source, event); break; } } } async handleClick(source, event) { const timestamp = new Date().getTime(); const pageUrl = source.page.url(); const selectorForReplay = await this.buildSelectorForReplay(source, event); const postCallImageId = await this.savePageScreenshot(source.page); const toolCall = { id: MiscUtils_1.MiscUtils.createAdHocToolCallId(), toolName: ClickTool_1.ClickTool.NAME, parameters: { selector: selectorForReplay, }, outcome: { isSuccessful: true, forLlm: 'Clicked the element.', metadata: selectorForReplay, }, postCallImageId: postCallImageId, page: pageUrl, startedAt: timestamp, completedAt: timestamp, }; this.host.invokedToolCalls.push(toolCall); await this.host.persistence.setToolCall(this.host.metadata.id, toolCall); } async handleKeyDown(source, event) { const key = event.key; if (key.length === 1 || key === 'Enter' || key === 'Backspace') { const timestamp = new Date().getTime(); const pageUrl = source.page.url(); const selectorForReplay = await this.buildSelectorForReplay(source, event); const lastToolCall = this.host.invokedToolCalls.at(this.host.invokedToolCalls.length - 1); if (lastToolCall && this.isCandidateForToolCallMerging(lastToolCall, key, pageUrl, selectorForReplay)) { let rewrittenToolCall = null; // Do we need to convert the tool type itself? if (lastToolCall.toolName === PressKeyTool_1.PressKeyTool.NAME) { // Was the last key pressed a normal key press? if (lastToolCall.parameters.key.length === 1) { // The previous key was not a control character like 'Enter', // so we can convert the pressKey tool to an inputText tool. // (The inputText tool only supports 'Enter' to finalize the // input, not in the middle of input.) const amendedText = lastToolCall.parameters.key + (key === 'Enter' ? '' : key); rewrittenToolCall = { ...lastToolCall, toolName: InputTextTool_1.InputTextTool.NAME, parameters: { text: amendedText, append: true, finalizeWithSubmit: key === 'Enter', }, outcome: { isSuccessful: true, forLlm: `Inputted '${amendedText}'`, metadata: lastToolCall.outcome.metadata, }, completedAt: timestamp, }; } } else if (lastToolCall.toolName === InputTextTool_1.InputTextTool.NAME) { // Was the last key pressed a normal key press? if (lastToolCall.parameters.finalizeWithSubmit !== true) { const amendedText = lastToolCall.parameters.text + (key === 'Enter' ? '' : key); rewrittenToolCall = { ...lastToolCall, parameters: { text: amendedText, append: true, finalizeWithSubmit: key === 'Enter', }, outcome: { isSuccessful: true, forLlm: `Inputted '${amendedText}'`, metadata: lastToolCall.outcome.metadata, }, completedAt: timestamp, }; } } // Phew! if (rewrittenToolCall) { this.host.invokedToolCalls[this.host.invokedToolCalls.length - 1] = rewrittenToolCall; await this.host.persistence.setToolCall(this.host.metadata.id, rewrittenToolCall); return; } } // If we are here, it means we did not squish together adjacent key events. const postCallImageId = await this.savePageScreenshot(source.page); const toolCall = { id: MiscUtils_1.MiscUtils.createAdHocToolCallId(), toolName: PressKeyTool_1.PressKeyTool.NAME, parameters: { selector: selectorForReplay, key: key, }, outcome: { isSuccessful: true, forLlm: `Pressed the '${key}' key at the element.`, metadata: selectorForReplay, }, postCallImageId: postCallImageId, page: pageUrl, startedAt: timestamp, completedAt: timestamp, }; this.host.invokedToolCalls.push(toolCall); await this.host.persistence.setToolCall(this.host.metadata.id, toolCall); } } async buildSelectorForReplay(source, event) { const elementSelectors = event.selectors.slice(0, ReplayableInteraction_1.ReplayableInteraction.MAX_SELECTOR_FAILOVERS); const frameSelector = source.frame.url() === source.page.url() ? null : (await PlaywrightUtils_1.PlaywrightUtils.generateSelectors(await source.frame.frameElement()))[0]; return { element: elementSelectors, frame: frameSelector, }; } isCandidateForToolCallMerging(lastToolCall, key, pageUrl, selectorForReplay) { // Is the key itself merge-able? if (key === 'Backspace') { return false; } // Is this a merge-able tool call at all? if (lastToolCall.toolName !== PressKeyTool_1.PressKeyTool.NAME && lastToolCall.toolName !== InputTextTool_1.InputTextTool.NAME) { return false; } // Are we on the same page as we were before? if (lastToolCall.page !== pageUrl) { return false; } // ...and the same frame? if (lastToolCall.outcome.metadata?.frame !== selectorForReplay.frame) { return false; } // ...and for the same target element? if (JSON.stringify(lastToolCall.outcome.metadata?.element) !== JSON.stringify(selectorForReplay.element)) { return false; } // Merge-able! return true; } async savePageScreenshot(page) { const postCallImage = await PlaywrightUtils_1.PlaywrightUtils.takeViewportScreenshot(page); return await this.host.persistence.saveScreenShot(this.host.metadata.id, postCallImage); } } exports.PageInteractionTracker = PageInteractionTracker; // If this name is updated, the installPageInteractionsTracker script must also be updated. PageInteractionTracker.NAME = '__donobuTrackInteraction'; //# sourceMappingURL=PageInteractionTracker.js.map