// @hyperbrowser/agent
// Version:
// Hyperbrowsers Web Agent
// 512 lines (511 loc) • 23.4 kB
// JavaScript
;
/**
 * Interop helper for default imports.
 *
 * A module object flagged with `__esModule` is returned untouched; any other
 * value is wrapped as `{ default: value }`. If `this` already carries an
 * `__importDefault` helper, that one is reused instead.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
  if (mod && mod.__esModule) {
    return mod;
  }
  return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.runAgentTask = void 0;
const fs_1 = __importDefault(require("fs"));
const perf_hooks_1 = require("perf_hooks");
const dom_cache_1 = require("../../context-providers/a11y-dom/dom-cache");
const cdp_1 = require("../../cdp");
const retry_1 = require("../../utils/retry");
const sleep_1 = require("../../utils/sleep");
const waitForSettledDOM_1 = require("../../utils/waitForSettledDOM");
const dom_capture_1 = require("../shared/dom-capture");
const runtime_context_1 = require("../shared/runtime-context");
const types_1 = require("../../types/index");
const types_2 = require("../../types/index");
const error_1 = require("../error");
const builder_1 = require("../messages/builder");
const system_prompt_1 = require("../messages/system-prompt");
const zod_1 = require("zod");
const actions_1 = require("../actions");
const jimp_1 = require("jimp");
// DomChunkAggregator logic moved to shared/dom-capture.ts
// Action types after which runAgentTask does NOT mark the cached DOM snapshot
// as dirty; every other action type invalidates the snapshot once it has run.
const READ_ONLY_ACTIONS = new Set([
  "wait",
  "extract",
  "complete",
]);
/**
 * Best-effort debug helper that serialises the page's frame context manager
 * to `<dir>/frames.json`.
 *
 * @param page  Page whose CDP client is looked up.
 * @param dir   Directory that receives `frames.json`.
 * @param debug Forwarded to the frame manager via `setDebug`; also controls
 *              whether a failure is reported with `console.warn`.
 * @returns Promise resolving to `undefined`; errors are never propagated.
 */
const writeFrameGraphSnapshot = async (page, dir, debug) => {
  try {
    const { getCDPClient, getOrCreateFrameContextManager } = cdp_1;
    const client = await getCDPClient(page);
    const frames = getOrCreateFrameContextManager(client);
    frames.setDebug(debug);
    const graph = frames.toJSON();
    const target = `${dir}/frames.json`;
    const serialized = JSON.stringify(graph, null, 2);
    fs_1.default.writeFileSync(target, serialized);
  } catch (err) {
    // Snapshot is diagnostic only: stay silent unless debugging.
    if (!debug) {
      return;
    }
    console.warn("[FrameContext] Failed to write frame graph:", err);
  }
};
/**
 * Captures a PNG screenshot of `page` through CDP and draws `overlay` on top.
 *
 * @param page    Page to capture.
 * @param overlay Base64-encoded image composited at (0, 0) over the capture.
 * @returns Base64-encoded PNG of the composited image.
 */
const compositeScreenshot = async (page, overlay) => {
  // Use CDP screenshot - faster, doesn't wait for fonts
  const cdp = await (0, cdp_1.getCDPClient)(page);
  const session = await cdp.acquireSession("screenshot");
  const capture = await session.send("Page.captureScreenshot", {
    format: "png",
  });
  const decode = (base64) => jimp_1.Jimp.read(Buffer.from(base64, "base64"));
  const [screenshot, overlayLayer] = await Promise.all([
    decode(capture.data),
    decode(overlay),
  ]);
  const shotWidth = screenshot.bitmap.width;
  const shotHeight = screenshot.bitmap.height;
  const sameSize = overlayLayer.bitmap.width === shotWidth &&
    overlayLayer.bitmap.height === shotHeight;
  // Sizes can differ (viewport: null, device pixel ratio); stretch the overlay
  // so it covers the capture exactly.
  if (!sameSize) {
    console.log(`[Screenshot] Dimension mismatch - overlay: ${overlayLayer.bitmap.width}x${overlayLayer.bitmap.height}, screenshot: ${shotWidth}x${shotHeight}, scaling overlay...`);
    overlayLayer.resize({ w: shotWidth, h: shotHeight });
  }
  screenshot.composite(overlayLayer, 0, 0);
  const png = await screenshot.getBuffer("image/png");
  return png.toString("base64");
};
/**
 * Builds the zod union that validates an agent action.
 *
 * Each registered action contributes an object schema of the form
 * `{ type: literal(action.type), params: action.actionParams }`.
 *
 * @param actions Registered action definitions.
 * @returns Union over all per-action object schemas.
 * @throws {Error} When `actions` is empty.
 */
const getActionSchema = (actions) => {
  const variants = actions.map(({ type, actionParams }) => zod_1.z.object({
    type: zod_1.z.literal(type),
    params: actionParams,
  }));
  switch (variants.length) {
    case 0:
      throw new Error("No actions registered for agent");
    case 1: {
      // The union is given two members even for a lone action, so the same
      // schema is listed twice.
      const only = variants[0];
      return zod_1.z.union([only, only]);
    }
    default:
      return zod_1.z.union([...variants]);
  }
};
/**
 * Looks up the `run` handler of the first registered action whose `type`
 * matches.
 *
 * @param actions Registered action definitions.
 * @param type    Action type to resolve.
 * @returns The matching definition's `run` property (may be undefined if the
 *          definition has none).
 * @throws {ActionNotFoundError} When no definition has the requested type.
 */
const getActionHandler = (actions, type) => {
  const match = actions.find((candidate) => candidate.type === type);
  if (!match) {
    throw new actions_1.ActionNotFoundError(type);
  }
  return match.run;
};
/**
 * Executes a single agent action and reports the outcome as a result object.
 *
 * The handler is resolved before any other work. An unknown action type (or a
 * definition without a `run` handler) yields
 * `{ success: false, message: "Unknown action type: ..." }` instead of letting
 * the lookup's ActionNotFoundError escape. Errors thrown by the handler itself
 * are likewise converted into a failed result.
 *
 * @param action   Parsed agent action: `{ type, params }`.
 * @param domState DOM state handed to the handler through the action context.
 * @param page     Page the action operates on.
 * @param ctx      Agent context; reads `actions`, `tokenLimit`, `llm`,
 *                 `debugDir`, `debug`, `mcpClient`, `variables`, `cdpActions`.
 * @returns The handler's result, or `{ success: false, message }` on failure.
 * @throws Lookup errors other than ActionNotFoundError, and errors raised
 *         while initialising the CDP runtime context, still propagate.
 */
const runAction = async (action, domState, page, ctx) => {
  const actionStart = perf_hooks_1.performance.now();
  const actionType = action.type;
  let actionHandler;
  try {
    actionHandler = getActionHandler(ctx.actions, actionType);
  } catch (error) {
    // Only "no such action" maps to a structured failure; anything else is a
    // programming error and must stay visible.
    if (!(error instanceof actions_1.ActionNotFoundError)) {
      throw error;
    }
  }
  if (!actionHandler) {
    return {
      success: false,
      message: `Unknown action type: ${actionType}`,
    };
  }
  const actionCtx = {
    domState,
    page,
    tokenLimit: ctx.tokenLimit,
    llm: ctx.llm,
    debugDir: ctx.debugDir,
    debug: ctx.debug,
    mcpClient: ctx.mcpClient || undefined,
    variables: Object.values(ctx.variables),
    cdpActions: ctx.cdpActions,
    // Lets a handler flag the cached DOM snapshot for this page as stale.
    invalidateDomCache: () => (0, dom_cache_1.markDomSnapshotDirty)(page),
  };
  if (ctx.cdpActions) {
    const { cdpClient, frameContextManager } = await (0, runtime_context_1.initializeRuntimeContext)(page, ctx.debug);
    actionCtx.cdp = {
      resolveElement: cdp_1.resolveElement,
      dispatchCDPAction: cdp_1.dispatchCDPAction,
      client: cdpClient,
      preferScriptBoundingBox: !!ctx.debugDir,
      frameContextManager,
      debug: ctx.debug,
    };
  }
  try {
    const result = await actionHandler(actionCtx, action.params);
    logPerf(ctx.debug, `[Perf][runAction][${actionType}]`, actionStart);
    return result;
  } catch (error) {
    logPerf(ctx.debug, `[Perf][runAction][${actionType}] (error)`, actionStart);
    return {
      success: false,
      message: `Action ${actionType} failed: ${error}`,
    };
  }
};
/**
 * Prints how long an operation took, but only in debug mode.
 *
 * @param debug When falsy nothing is measured or printed.
 * @param label Prefix of the printed line.
 * @param start `performance.now()` timestamp taken when the operation began.
 */
function logPerf(debug, label, start) {
  if (debug) {
    const elapsed = perf_hooks_1.performance.now() - start;
    console.log(`${label} took ${Math.round(elapsed)}ms`);
  }
}
/**
 * Drives the agent loop for one task until it reaches an end status.
 *
 * Each iteration: switch to the active page if it changed, honour pause/end
 * statuses and `maxSteps`, capture the DOM state (plus an optional composited
 * screenshot), ask the LLM for one structured action (up to 3 attempts with
 * validation feedback), run the action, track consecutive waits/failures, wait
 * for the DOM to settle and record the step.
 *
 * @param ctx       Agent context. Reads `debug`, `llm`, `actions`,
 *                  `activePage`, `variables`; creates and appends to
 *                  `schemaErrors`.
 * @param taskState Mutable task record. Reads `id`, `startingPage`, `task`;
 *                  updates `status`, `error` and appends to `steps`.
 * @param params    Optional settings: `debugDir`, `useDomCache`,
 *                  `enableDomStreaming`, `enableVisualMode`, `maxSteps`, and
 *                  the callbacks `debugOnAgentOutput`, `onStep`, `onComplete`.
 * @returns `{ status, steps, output }` for the finished task.
 * @throws {HyperagentError} When `taskState` is missing or `ctx.llm` is unset.
 * @throws {Error} When no valid structured output is obtained after 3 attempts.
 */
const runAgentTask = async (ctx, taskState, params) => {
  // Validate before the first property access; otherwise a missing task
  // surfaces as a TypeError instead of the intended HyperagentError.
  if (!taskState) {
    throw new error_1.HyperagentError("Task not found: taskState is missing");
  }
  const taskStart = perf_hooks_1.performance.now();
  const taskId = taskState.id;
  const debugDir = params?.debugDir || `debug/${taskId}`;
  if (ctx.debug) {
    console.log(`Debugging task ${taskId} in ${debugDir}`);
  }
  taskState.status = types_2.TaskStatus.RUNNING;
  if (!ctx.llm) {
    throw new error_1.HyperagentError("LLM not initialized");
  }
  // Use the new structured output interface
  const actionSchema = getActionSchema(ctx.actions);
  // V1 always uses visual mode with full system prompt
  const systemPrompt = system_prompt_1.SYSTEM_PROMPT;
  const baseMsgs = [
    { role: "system", content: systemPrompt },
  ];
  let output = "";
  let page = taskState.startingPage;
  const useDomCache = params?.useDomCache === true;
  const enableDomStreaming = params?.enableDomStreaming === true;
  // Track schema validation errors across steps
  if (!ctx.schemaErrors) {
    ctx.schemaErrors = [];
  }
  // Reads `page` lazily so it always targets the currently active page.
  const navigationDirtyHandler = () => {
    (0, dom_cache_1.markDomSnapshotDirty)(page);
  };
  const setupDomListeners = (p) => {
    p.on("framenavigated", navigationDirtyHandler);
    p.on("framedetached", navigationDirtyHandler);
    p.on("load", navigationDirtyHandler);
  };
  const cleanupDomListeners = (p) => {
    p.off?.("framenavigated", navigationDirtyHandler);
    p.off?.("framedetached", navigationDirtyHandler);
    p.off?.("load", navigationDirtyHandler);
  };
  setupDomListeners(page);
  let currStep = 0;
  let consecutiveFailuresOrWaits = 0;
  const MAX_CONSECUTIVE_FAILURES_OR_WAITS = 5;
  let lastOverlayKey = null;
  let lastScreenshotBase64;
  try {
    // Initialize context at the start of the task
    await (0, runtime_context_1.initializeRuntimeContext)(page, ctx.debug);
    while (true) {
      // Check for page context switch
      if (ctx.activePage) {
        const newPage = await ctx.activePage();
        if (newPage && newPage !== page) {
          if (ctx.debug) {
            console.log(`[Agent] Switching active page context to ${newPage.url()}`);
          }
          cleanupDomListeners(page);
          page = newPage;
          setupDomListeners(page);
          await (0, runtime_context_1.initializeRuntimeContext)(page, ctx.debug);
          (0, dom_cache_1.markDomSnapshotDirty)(page);
        }
      }
      // Status Checks
      const status = taskState.status;
      if (status === types_2.TaskStatus.PAUSED) {
        await (0, sleep_1.sleep)(100);
        continue;
      }
      if (types_1.endTaskStatuses.has(status)) {
        break;
      }
      if (params?.maxSteps && currStep >= params.maxSteps) {
        taskState.status = types_2.TaskStatus.CANCELLED;
        break;
      }
      const debugStepDir = `${debugDir}/step-${currStep}`;
      const stepStart = perf_hooks_1.performance.now();
      const stepMetrics = {
        stepIndex: currStep,
      };
      if (ctx.debug) {
        fs_1.default.mkdirSync(debugStepDir, { recursive: true });
      }
      // Get A11y DOM State (visual mode optional, default false for performance)
      let domState = null;
      try {
        const domFetchStart = perf_hooks_1.performance.now();
        await (0, waitForSettledDOM_1.waitForSettledDOM)(page);
        domState = await (0, dom_capture_1.captureDOMState)(page, {
          useCache: useDomCache,
          debug: ctx.debug,
          enableVisualMode: params?.enableVisualMode ?? false,
          debugStepDir: ctx.debug ? debugStepDir : undefined,
          enableStreaming: enableDomStreaming,
          onFrameChunk: enableDomStreaming
            ? () => {
              // captureDOMState handles aggregation
            }
            : undefined,
        });
        const domDuration = perf_hooks_1.performance.now() - domFetchStart;
        stepMetrics.domCaptureMs = Math.round(domDuration);
      } catch (error) {
        if (ctx.debug) {
          console.log("Failed to retrieve DOM state after 3 retries. Failing task.", error);
        }
        taskState.status = types_2.TaskStatus.FAILED;
        taskState.error = "Failed to retrieve DOM state";
        break;
      }
      if (!domState) {
        taskState.status = types_2.TaskStatus.FAILED;
        taskState.error = "Failed to retrieve DOM state";
        break;
      }
      // If visual mode enabled, composite screenshot with overlay
      let trimmedScreenshot;
      if (domState.visualOverlay) {
        const overlayKey = domState.visualOverlay;
        // Same overlay as the previous step: reuse the composited image.
        if (overlayKey === lastOverlayKey && lastScreenshotBase64) {
          trimmedScreenshot = lastScreenshotBase64;
        } else {
          trimmedScreenshot = await compositeScreenshot(page, overlayKey);
          lastOverlayKey = overlayKey;
          lastScreenshotBase64 = trimmedScreenshot;
        }
      } else {
        lastOverlayKey = null;
        lastScreenshotBase64 = undefined;
      }
      // Store Dom State for Debugging
      if (ctx.debug) {
        fs_1.default.mkdirSync(debugDir, { recursive: true });
        fs_1.default.writeFileSync(`${debugStepDir}/elems.txt`, domState.domState);
        if (trimmedScreenshot) {
          fs_1.default.writeFileSync(`${debugStepDir}/screenshot.png`, Buffer.from(trimmedScreenshot, "base64"));
        }
      }
      // Build Agent Step Messages
      let msgs = await (0, builder_1.buildAgentStepMessages)(baseMsgs, taskState.steps, taskState.task, page, domState, trimmedScreenshot, Object.values(ctx.variables));
      // Append accumulated schema errors from previous steps
      if (ctx.schemaErrors && ctx.schemaErrors.length > 0) {
        const errorSummary = ctx.schemaErrors
          .slice(-3) // Only keep last 3 errors to avoid context bloat
          .map((err) => `Step ${err.stepIndex}: ${err.error}`)
          .join("\n");
        msgs = [
          ...msgs,
          {
            role: "user",
            content: `Note: Previous steps had schema validation errors. Learn from these:\n${errorSummary}\n\nEnsure your response follows the exact schema structure.`,
          },
        ];
      }
      // Store Agent Step Messages for Debugging
      if (ctx.debug) {
        fs_1.default.writeFileSync(`${debugStepDir}/msgs.json`, JSON.stringify(msgs, null, 2));
      }
      // Invoke LLM with structured output
      const agentOutput = await (async () => {
        const maxAttempts = 3;
        let currentMsgs = msgs;
        for (let attempt = 0; attempt < maxAttempts; attempt++) {
          const structuredResult = await (0, retry_1.retry)({
            func: async () => {
              const llmStart = perf_hooks_1.performance.now();
              const result = await ctx.llm.invokeStructured({
                schema: (0, types_1.AgentOutputFn)(actionSchema),
                options: {
                  temperature: 0,
                },
                actions: ctx.actions,
              }, currentMsgs);
              const llmDuration = perf_hooks_1.performance.now() - llmStart;
              logPerf(ctx.debug, `[Perf][runAgentTask] llm.invokeStructured(step ${currStep})`, llmStart);
              stepMetrics.llmMs = Math.round(llmDuration);
              return result;
            },
            onError: (...args) => {
              console.error("[LLM][StructuredOutput] Retry error", ...args);
            },
          });
          if (structuredResult.parsed) {
            return structuredResult.parsed;
          }
          const providerId = ctx.llm?.getProviderId?.() ?? "unknown-provider";
          const modelId = ctx.llm?.getModelId?.() ?? "unknown-model";
          // Try to get detailed Zod validation error by re-parsing the raw text
          let validationError = "Unknown validation error";
          if (structuredResult.rawText) {
            try {
              const parsed = JSON.parse(structuredResult.rawText);
              (0, types_1.AgentOutputFn)(actionSchema).parse(parsed);
            } catch (zodError) {
              if (zodError instanceof zod_1.z.ZodError) {
                validationError = JSON.stringify(zodError.issues, null, 2);
              } else {
                validationError = String(zodError);
              }
            }
          }
          console.error(`[LLM][StructuredOutput] Failed to parse response from ${providerId} (${modelId}). Raw response: ${structuredResult.rawText?.trim() || "<empty>"} (attempt ${attempt + 1}/${maxAttempts})`);
          // Store error for cross-step learning
          ctx.schemaErrors?.push({
            stepIndex: currStep,
            error: validationError,
            rawResponse: structuredResult.rawText || "",
          });
          // Append error feedback for next retry
          if (attempt < maxAttempts - 1) {
            currentMsgs = [
              ...currentMsgs,
              {
                role: "assistant",
                content: structuredResult.rawText || "Failed to generate response",
              },
              {
                role: "user",
                content: `The previous response failed validation. Zod validation errors:\n\`\`\`json\n${validationError}\n\`\`\`\n\nPlease fix these errors and return valid structured output matching the schema.`,
              },
            ];
          }
        }
        throw new Error("Failed to get structured output from LLM");
      })();
      params?.debugOnAgentOutput?.(agentOutput);
      // Status Checks (the status may have changed while the LLM was running)
      const statusAfterLLM = taskState.status;
      if (statusAfterLLM === types_2.TaskStatus.PAUSED) {
        await (0, sleep_1.sleep)(100);
        continue;
      }
      if (types_1.endTaskStatuses.has(statusAfterLLM)) {
        break;
      }
      // Run single action
      const action = agentOutput.action;
      // Handle complete action specially: mark the task completed and compute
      // the final output; the loop exits at the next status check.
      if (action.type === "complete") {
        taskState.status = types_2.TaskStatus.COMPLETED;
        const actionDefinition = ctx.actions.find((definition) => definition.type === "complete");
        if (actionDefinition) {
          output =
            (await actionDefinition.completeAction?.(action.params)) ??
              "No complete action found";
        } else {
          output = "No complete action found";
        }
      }
      // Execute the action
      const actionExecStart = perf_hooks_1.performance.now();
      const actionOutput = await runAction(action, domState, page, ctx);
      const actionDuration = perf_hooks_1.performance.now() - actionExecStart;
      logPerf(ctx.debug, `[Perf][runAgentTask] runAction(step ${currStep})`, actionExecStart);
      stepMetrics.actionMs = Math.round(actionDuration);
      stepMetrics.actionType = action.type;
      stepMetrics.actionSuccess = actionOutput.success;
      if (actionOutput.debug &&
        typeof actionOutput.debug === "object" &&
        "timings" in actionOutput.debug &&
        actionOutput.debug.timings &&
        typeof actionOutput.debug.timings === "object") {
        stepMetrics.actionTimings = actionOutput.debug.timings;
      }
      if (!READ_ONLY_ACTIONS.has(action.type)) {
        (0, dom_cache_1.markDomSnapshotDirty)(page);
      }
      // Check action result and handle retry logic: a wait and a failed action
      // both count towards the "stuck" limit; any other success resets it.
      const isWait = action.type === "wait";
      if (isWait || !actionOutput.success) {
        consecutiveFailuresOrWaits++;
        if (consecutiveFailuresOrWaits >= MAX_CONSECUTIVE_FAILURES_OR_WAITS) {
          const stuckMessage = `Agent is stuck: waited or failed ${MAX_CONSECUTIVE_FAILURES_OR_WAITS} consecutive times without making progress.`;
          taskState.status = types_2.TaskStatus.FAILED;
          taskState.error = isWait
            ? stuckMessage
            : `${stuckMessage} Last error: ${actionOutput.message}`;
          const step = {
            idx: currStep,
            agentOutput: agentOutput,
            actionOutput,
          };
          taskState.steps.push(step);
          await params?.onStep?.(step);
          break;
        }
        if (ctx.debug) {
          const outcome = isWait ? "Wait action" : "Action failed";
          console.log(`[agent] ${outcome} (${consecutiveFailuresOrWaits}/${MAX_CONSECUTIVE_FAILURES_OR_WAITS}): ${actionOutput.message}`);
        }
      } else {
        // Success - reset counter
        consecutiveFailuresOrWaits = 0;
      }
      // Wait for DOM to settle after action
      const waitStats = await (0, waitForSettledDOM_1.waitForSettledDOM)(page);
      stepMetrics.waitForSettledMs = Math.round(waitStats.durationMs);
      stepMetrics.waitForSettled = {
        totalMs: Math.round(waitStats.durationMs),
        lifecycleMs: Math.round(waitStats.lifecycleMs),
        networkMs: Math.round(waitStats.networkMs),
        requestsSeen: waitStats.requestsSeen,
        peakInflight: waitStats.peakInflight,
        reason: waitStats.resolvedByTimeout ? "timeout" : "quiet",
        forcedDrops: waitStats.forcedDrops,
      };
      const step = {
        idx: currStep,
        agentOutput,
        actionOutput,
      };
      taskState.steps.push(step);
      await params?.onStep?.(step);
      currStep = currStep + 1;
      const totalDuration = perf_hooks_1.performance.now() - stepStart;
      logPerf(ctx.debug, `[Perf][runAgentTask] step ${currStep - 1} total`, stepStart);
      stepMetrics.totalMs = Math.round(totalDuration);
      if (ctx.debug) {
        await writeFrameGraphSnapshot(page, debugStepDir, ctx.debug);
        fs_1.default.writeFileSync(`${debugStepDir}/stepOutput.json`, JSON.stringify(step, null, 2));
        fs_1.default.writeFileSync(`${debugStepDir}/perf.json`, JSON.stringify(stepMetrics, null, 2));
      }
    }
    logPerf(ctx.debug, `[Perf][runAgentTask] Task ${taskId}`, taskStart);
  } finally {
    // Always detach listeners from whichever page is active at exit.
    cleanupDomListeners(page);
  }
  const taskOutput = {
    status: taskState.status,
    steps: taskState.steps,
    output,
  };
  if (ctx.debug) {
    // The loop can exit before any step directory was created (e.g. the task
    // was ended externally before the first step), so make sure the target
    // directory exists before writing into it.
    fs_1.default.mkdirSync(debugDir, { recursive: true });
    fs_1.default.writeFileSync(`${debugDir}/taskOutput.json`, JSON.stringify(taskOutput, null, 2));
  }
  await params?.onComplete?.(taskOutput);
  return taskOutput;
};
exports.runAgentTask = runAgentTask;