donobu
Version:
Create browser automations with an LLM agent and replay them as Playwright scripts.
253 lines • 11.6 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.PageInteractionTracker = exports.InteractionEventSchema = void 0;
const v4_1 = require("zod/v4");
const donobu_namespace_1 = require("../browser-side-scripts/donobu-namespace");
const page_interactions_tracker_1 = require("../browser-side-scripts/page-interactions-tracker");
const smart_selector_generator_1 = require("../browser-side-scripts/smart-selector-generator");
const ClickTool_1 = require("../tools/ClickTool");
const InputTextTool_1 = require("../tools/InputTextTool");
const PressKeyTool_1 = require("../tools/PressKeyTool");
const ReplayableInteraction_1 = require("../tools/ReplayableInteraction");
const Logger_1 = require("../utils/Logger");
const MiscUtils_1 = require("../utils/MiscUtils");
const PlaywrightUtils_1 = require("../utils/PlaywrightUtils");
/**
* Zod schema for validating InteractionEvent objects.
*/
exports.InteractionEventSchema = v4_1.z.object({
// The type of interaction (e.g. "click", "keydown", etc.)
type: v4_1.z.string(),
// Optional key pressed during the interaction (e.g. "Enter")
key: v4_1.z.string().optional(),
// Array of CSS/Xpath selectors representing the target element involved
selectors: v4_1.z.array(v4_1.z.string()),
timestamp: v4_1.z.number(),
x: v4_1.z.number().optional(),
y: v4_1.z.number().optional(),
dragDistance: v4_1.z.number().optional(),
keyCode: v4_1.z.int().optional(),
altKey: v4_1.z.boolean().optional(),
ctrlKey: v4_1.z.boolean().optional(),
metaKey: v4_1.z.boolean().optional(),
shiftKey: v4_1.z.boolean().optional(),
});
/**
 * Tracks direct user actions that have occurred in the web browser.
 * This differs from normal Donobu flows which use GPT to analyze pages and propose actions.
 * This tracker records direct user actions (clicks and key presses) as
 * synthetic tool calls on the host flow, for reliable replay and GPT visibility.
 */
class PageInteractionTracker {
    /**
     * Creates a tracker for `host` and wires it into `browserContext`:
     * exposes the tracking binding, installs the Donobu init scripts, and
     * re-installs the in-page tracker whenever a frame of a newly opened
     * page navigates.
     *
     * Registration is best-effort: if any setup step fails (typically because
     * the binding has already been registered on this context) the error is
     * logged at debug level and the instance is still returned.
     *
     * @param host - The flow object whose `metadata`, `invokedToolCalls` and
     *   `persistence` are used to record tool calls.
     * @param browserContext - The Playwright browser context to instrument.
     * @returns The tracker instance.
     */
    static async register(host, browserContext) {
        const instance = new PageInteractionTracker(host);
        try {
            await browserContext.exposeBinding(instance.name(), instance.call.bind(instance));
            await browserContext.addInitScript(donobu_namespace_1.installDonobuNamespace);
            await browserContext.addInitScript(smart_selector_generator_1.installSmartSelectorGenerator);
            browserContext.on('page', (page) => {
                page.on('framenavigated', async (frame) => {
                    try {
                        await frame.evaluate(page_interactions_tracker_1.installPageInteractionsTracker);
                    }
                    catch (error) {
                        // A page closing mid-evaluation is expected; anything else is worth surfacing.
                        if (!PlaywrightUtils_1.PlaywrightUtils.isPageClosedError(error)) {
                            Logger_1.appLogger.error('Error when evaluating installPageInteractionsTracker', error);
                        }
                    }
                });
            });
        }
        catch (error) {
            // Most likely the binding has already been registered on this
            // context. Keep this best-effort, but leave a trace so that other
            // failures are not completely invisible.
            Logger_1.appLogger.debug(`PageInteractionTracker registration skipped: ${String(error)}`);
        }
        return instance;
    }
    /**
     * @param host - The flow object that recorded tool calls are attached to.
     */
    constructor(host) {
        this.host = host;
    }
    /**
     * @returns The name under which the tracking binding is exposed to pages.
     */
    name() {
        return PageInteractionTracker.NAME;
    }
    /**
     * Handles tracking of page interactions. Events are only recorded while
     * the flow is paused or waiting on the user; in any other state they are
     * ignored.
     *
     * Assumes:
     * - args is a single-element array with first element being JSON convertible to InteractionEvent
     *
     * @param source - Binding source supplied by Playwright; `source.page` and
     *   `source.frame` are read by the handlers.
     * @param args - Arguments passed from the page; only `args[0]` is used.
     * @throws If `args[0]` does not satisfy `InteractionEventSchema`.
     *
     * NOTE(review): events are handled concurrently as they arrive; since the
     * handlers await screenshots before recording, rapid events may
     * interleave — confirm whether the in-page script serializes them.
     */
    async call(source, ...args) {
        const flowState = this.host.metadata.state;
        const isUserDriving = flowState === 'PAUSED' || flowState === 'WAITING_ON_USER_FOR_NEXT_ACTION';
        if (!isUserDriving) {
            return;
        }
        Logger_1.appLogger.debug(`Event: ${JSON.stringify(args[0])}`);
        const event = exports.InteractionEventSchema.parse(args[0]);
        await this.handleSyntheticToolCall(source, event);
    }
    /**
     * Dispatches a validated event to the handler for its type. Event types
     * other than 'click' and 'keydown' are ignored.
     */
    async handleSyntheticToolCall(source, event) {
        switch (event.type) {
            case 'click': {
                await this.handleClick(source, event);
                break;
            }
            case 'keydown': {
                await this.handleKeyDown(source, event);
                break;
            }
        }
    }
    /**
     * Records a user click as a successful ClickTool call, both in the host's
     * in-memory list and in persistence.
     */
    async handleClick(source, event) {
        const timestamp = Date.now();
        const pageUrl = source.page.url();
        const selectorForReplay = await this.buildSelectorForReplay(source, event);
        const postCallImageId = await this.savePageScreenshot(source.page);
        const toolCall = {
            id: MiscUtils_1.MiscUtils.createAdHocToolCallId(),
            toolName: ClickTool_1.ClickTool.NAME,
            parameters: {
                selector: selectorForReplay,
            },
            outcome: {
                isSuccessful: true,
                forLlm: 'Clicked the element.',
                metadata: selectorForReplay,
            },
            postCallImageId,
            page: pageUrl,
            startedAt: timestamp,
            completedAt: timestamp,
        };
        this.host.invokedToolCalls.push(toolCall);
        await this.host.persistence.setToolCall(this.host.metadata.id, toolCall);
    }
    /**
     * Records a user key press. Only single-character keys, 'Enter' and
     * 'Backspace' are recorded; everything else is ignored.
     *
     * When the previous tool call targeted the same page, frame and element,
     * the key is folded into it (yielding a single InputTextTool call)
     * instead of appending a new PressKeyTool call.
     */
    async handleKeyDown(source, event) {
        const { key } = event;
        // `key` is optional in InteractionEventSchema, so a keydown event may
        // arrive without it; there is nothing to record in that case.
        if (typeof key !== 'string') {
            return;
        }
        const isRecordableKey = key.length === 1 || key === 'Enter' || key === 'Backspace';
        if (!isRecordableKey) {
            return;
        }
        const timestamp = Date.now();
        const pageUrl = source.page.url();
        const selectorForReplay = await this.buildSelectorForReplay(source, event);
        const toolCalls = this.host.invokedToolCalls;
        const lastToolCall = toolCalls.at(-1);
        if (lastToolCall &&
            this.isCandidateForToolCallMerging(lastToolCall, key, pageUrl, selectorForReplay)) {
            const rewrittenToolCall = this.mergeKeyIntoToolCall(lastToolCall, key, timestamp);
            if (rewrittenToolCall) {
                // Replace the previous call in place (same id), then persist it.
                toolCalls[toolCalls.length - 1] = rewrittenToolCall;
                await this.host.persistence.setToolCall(this.host.metadata.id, rewrittenToolCall);
                return;
            }
        }
        // If we are here, it means we did not squish together adjacent key events.
        const postCallImageId = await this.savePageScreenshot(source.page);
        const toolCall = {
            id: MiscUtils_1.MiscUtils.createAdHocToolCallId(),
            toolName: PressKeyTool_1.PressKeyTool.NAME,
            parameters: {
                selector: selectorForReplay,
                key,
            },
            outcome: {
                isSuccessful: true,
                forLlm: `Pressed the '${key}' key at the element.`,
                metadata: selectorForReplay,
            },
            postCallImageId,
            page: pageUrl,
            startedAt: timestamp,
            completedAt: timestamp,
        };
        toolCalls.push(toolCall);
        await this.host.persistence.setToolCall(this.host.metadata.id, toolCall);
    }
    /**
     * Builds the InputTextTool call that results from folding `key` into
     * `lastToolCall`, or returns null when the two cannot be combined.
     *
     * - A PressKeyTool call is converted only if its key was a single
     *   character (i.e. not a control key like 'Enter'), because the
     *   inputText tool only supports 'Enter' to finalize the input, not in
     *   the middle of it.
     * - An InputTextTool call is extended only if it has not already been
     *   finalized with a submit.
     *
     * 'Enter' adds no text; it sets `finalizeWithSubmit` instead.
     *
     * NOTE(review): the merged parameters carry no `selector`; the target is
     * only retained in `outcome.metadata` — confirm InputTextTool replay
     * reads it from there.
     *
     * @param lastToolCall - The most recently recorded tool call.
     * @param key - The key that was just pressed.
     * @param timestamp - Completion time to stamp on the merged call.
     * @returns The rewritten tool call, or null if no merge applies.
     */
    mergeKeyIntoToolCall(lastToolCall, key, timestamp) {
        let previousText;
        if (lastToolCall.toolName === PressKeyTool_1.PressKeyTool.NAME) {
            const previousKey = lastToolCall.parameters.key;
            if (typeof previousKey !== 'string' || previousKey.length !== 1) {
                return null;
            }
            previousText = previousKey;
        }
        else if (lastToolCall.toolName === InputTextTool_1.InputTextTool.NAME) {
            if (lastToolCall.parameters.finalizeWithSubmit === true) {
                return null;
            }
            previousText = lastToolCall.parameters.text;
        }
        else {
            return null;
        }
        const isSubmit = key === 'Enter';
        const amendedText = previousText + (isSubmit ? '' : key);
        return {
            ...lastToolCall,
            toolName: InputTextTool_1.InputTextTool.NAME,
            parameters: {
                text: amendedText,
                append: true,
                finalizeWithSubmit: isSubmit,
            },
            outcome: {
                isSuccessful: true,
                forLlm: `Inputted '${amendedText}'`,
                metadata: lastToolCall.outcome.metadata,
            },
            completedAt: timestamp,
        };
    }
    /**
     * Builds the selector descriptor stored with a recorded tool call.
     *
     * @returns An object with `element` (the event's selectors, capped at
     *   `ReplayableInteraction.MAX_SELECTOR_FAILOVERS`) and `frame` (null for
     *   the page's main frame, otherwise the first generated selector for the
     *   frame's owning element).
     */
    async buildSelectorForReplay(source, event) {
        const elementSelectors = event.selectors.slice(0, ReplayableInteraction_1.ReplayableInteraction.MAX_SELECTOR_FAILOVERS);
        // Compare frame identity rather than URLs: a child iframe may load the
        // same URL as its page and must still get a frame selector.
        const isMainFrame = source.frame === source.page.mainFrame();
        let frameSelector = null;
        if (!isMainFrame) {
            const frameElement = await source.frame.frameElement();
            const frameSelectors = await PlaywrightUtils_1.PlaywrightUtils.generateSelectors(frameElement);
            frameSelector = frameSelectors[0];
        }
        return {
            element: elementSelectors,
            frame: frameSelector,
        };
    }
    /**
     * Decides whether a new key press may be folded into `lastToolCall`.
     *
     * @returns true only if the key is not 'Backspace', the last call is a
     *   PressKeyTool or InputTextTool call, and it was recorded on the same
     *   page URL, frame selector and element selectors.
     */
    isCandidateForToolCallMerging(lastToolCall, key, pageUrl, selectorForReplay) {
        // Is the key itself merge-able?
        if (key === 'Backspace') {
            return false;
        }
        // Is this a merge-able tool call at all?
        if (lastToolCall.toolName !== PressKeyTool_1.PressKeyTool.NAME &&
            lastToolCall.toolName !== InputTextTool_1.InputTextTool.NAME) {
            return false;
        }
        // Are we on the same page as we were before?
        if (lastToolCall.page !== pageUrl) {
            return false;
        }
        // ...and the same frame?
        if (lastToolCall.outcome.metadata?.frame !== selectorForReplay.frame) {
            return false;
        }
        // ...and for the same target element? (Selector arrays are compared by value.)
        if (JSON.stringify(lastToolCall.outcome.metadata?.element) !==
            JSON.stringify(selectorForReplay.element)) {
            return false;
        }
        // Merge-able!
        return true;
    }
    /**
     * Takes a viewport screenshot of `page` and stores it for the host flow.
     *
     * @returns Whatever `persistence.saveScreenShot` returns (used as the
     *   tool call's `postCallImageId`).
     */
    async savePageScreenshot(page) {
        const postCallImage = await PlaywrightUtils_1.PlaywrightUtils.takeViewportScreenshot(page);
        return await this.host.persistence.saveScreenShot(this.host.metadata.id, postCallImage);
    }
}
exports.PageInteractionTracker = PageInteractionTracker;
// Name of the binding exposed to pages: it is returned by `name()` and passed
// to `exposeBinding` in `register`.
// If this name is updated, the installPageInteractionsTracker script must also be updated.
PageInteractionTracker.NAME = '__donobuTrackInteraction';
//# sourceMappingURL=PageInteractionTracker.js.map