@ejazullah/smart-browser-automation
Version:
A smart AI-driven browser automation library and REST API server using MCP (Model Context Protocol) and LangChain for multi-step task execution. Includes both programmatic library usage and HTTP API server for remote automation.
426 lines (367 loc) • 11.4 kB
JavaScript
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js';
import { SSEClientTransport } from '@modelcontextprotocol/sdk/client/sse.js';
import { ChatOpenAI } from '@langchain/openai';
import { tool } from '@langchain/core/tools';
import { z } from 'zod';
/**
* Smart Browser Automation class for AI-driven multi-step browser tasks
*/
export class SmartBrowserAutomation {
  /**
   * @param {Object} [config] - Overrides merged over the defaults below.
   * @param {number} [config.maxSteps=10] - Upper bound on LLM/tool iterations per task.
   * @param {number} [config.temperature=0.0] - Sampling temperature for the LLM.
   * @param {string} [config.transportType='streamable'] - 'streamable' or 'sse'.
   */
  constructor(config) {
    this.config = {
      maxSteps: 10,
      temperature: 0.0,
      transportType: 'streamable', // 'streamable' or 'sse'
      ...config,
    };
    this.mcpClient = null;
    this.llm = null;
    this.transport = null;
  }

  /**
   * Initialize the automation system: build the LLM client, create the MCP
   * client, pick a transport based on `config.transportType`, and connect.
   * @param {Object} llmConfig - LLM configuration ({ model, apiKey, baseURL }).
   * @param {string} mcpEndpoint - MCP server endpoint URL.
   * @param {string} driverUrl - WebDriver/CDP URL (passed as `cdpEndpoint`
   *   query param for the streamable transport).
   */
  async initialize(llmConfig, mcpEndpoint, driverUrl) {
    // Initialize LLM.
    // NOTE(review): the original passed `quantization: "4bit"`, which is not
    // a ChatOpenAI constructor option and was silently ignored — removed.
    this.llm = new ChatOpenAI({
      model: llmConfig.model,
      apiKey: llmConfig.apiKey,
      temperature: this.config.temperature,
      maxTokens: 2048,
      configuration: {
        baseURL: llmConfig.baseURL,
      },
    });

    // Initialize MCP client with the capabilities this library uses.
    this.mcpClient = new Client(
      {
        name: '@ejazullah/smart-browser-automation',
        version: '1.0.0',
      },
      {
        capabilities: {
          prompts: {},
          resources: {},
          tools: {},
          logging: {},
        },
      }
    );

    // Setup transport based on configured type.
    if (this.config.transportType === 'sse') {
      // Use SSE Transport.
      this.transport = new SSEClientTransport(new URL(mcpEndpoint));
    } else {
      // Use Streamable HTTP Transport (default); the driver URL rides along
      // as an encoded `cdpEndpoint` query parameter.
      const endpointUrl = new URL(mcpEndpoint);
      endpointUrl.searchParams.set('cdpEndpoint', driverUrl);
      this.transport = new StreamableHTTPClientTransport(endpointUrl);
    }

    // Connect to MCP server.
    await this.mcpClient.connect(this.transport);
  }

  /**
   * Fetch available tools from the MCP server.
   * @returns {Promise<Object>} The raw `listTools()` result (`{ tools: [...] }`).
   * @throws Re-throws any transport/protocol error after logging it.
   */
  async fetchTools() {
    try {
      return await this.mcpClient.listTools();
    } catch (error) {
      console.error('Error fetching tools:', error);
      throw error;
    }
  }

  /**
   * Execute a single tool directly without LLM involvement.
   * @param {string} toolName - Name of the tool to execute.
   * @param {Object} [toolArgs] - Arguments for the tool.
   * @param {Object} [options] - Additional options ({ verbose }).
   * @returns {Promise<Object>} `{ success, toolName, arguments, result }` on
   *   success, `{ success: false, toolName, arguments, error }` on failure —
   *   tool failures are reported, not thrown.
   * @throws {Error} Only if `initialize()` has not been called yet.
   */
  async callTool(toolName, toolArgs = {}, options = {}) {
    const { verbose = false } = options;
    if (!this.mcpClient) {
      throw new Error('MCP client not initialized. Call initialize() first.');
    }
    try {
      if (verbose) {
        console.log(`Calling tool: ${toolName}`);
        console.log('Arguments:', toolArgs);
      }
      const result = await this.mcpClient.callTool({
        name: toolName,
        arguments: toolArgs,
      });
      if (verbose) {
        console.log('Tool executed successfully');
      }
      return {
        success: true,
        toolName,
        arguments: toolArgs,
        result,
      };
    } catch (error) {
      if (verbose) {
        console.error('Tool execution failed:', error.message);
      }
      return {
        success: false,
        toolName,
        arguments: toolArgs,
        error: error.message,
      };
    }
  }

  /**
   * Convert a (subset of) JSON Schema into a Zod schema so MCP tool input
   * schemas can be handed to LangChain's `tool()` binder.
   * Non-object/missing schemas degrade to an empty `z.object({})`;
   * unrecognized property types degrade to `z.any()`.
   * @param {Object} jsonSchema - JSON Schema object (expects `type: "object"`).
   * @returns {import('zod').ZodObject} Equivalent Zod object schema.
   */
  jsonSchemaToZod(jsonSchema) {
    if (!jsonSchema || jsonSchema.type !== 'object' || !jsonSchema.properties) {
      return z.object({});
    }
    const shape = {};
    for (const [key, value] of Object.entries(jsonSchema.properties)) {
      let zodType;
      switch (value.type) {
        case 'string':
          zodType = z.string();
          break;
        case 'number':
          zodType = z.number();
          break;
        case 'integer':
          zodType = z.number().int();
          break;
        case 'boolean':
          zodType = z.boolean();
          break;
        case 'array':
          zodType = z.array(this.jsonSchemaToZod(value.items));
          break;
        case 'object':
          zodType = this.jsonSchemaToZod(value);
          break;
        default:
          zodType = z.any();
      }
      // Properties not listed in `required` are optional AND nullable, since
      // some LLM providers emit explicit nulls for omitted arguments.
      if (!jsonSchema.required?.includes(key)) {
        zodType = zodType.optional().nullable();
      }
      shape[key] = zodType;
    }
    return z.object(shape);
  }

  /**
   * Execute a smart browser automation task as a multi-step LLM tool loop:
   * fetch tools, bind them to the LLM, then alternate LLM-decide / tool-run
   * until the LLM stops calling tools or `config.maxSteps` is reached.
   * @param {string} taskDescription - The task to execute.
   * @param {Object} [options] - Additional options.
   * @param {boolean} [options.verbose=true] - Log progress to the console.
   * @param {string} [options.systemPrompt] - Overrides the default agent prompt.
   * @param {Function} [options.onProgress] - Callback receiving progress events.
   * @returns {Promise<Object>} `{ success, steps, results, completed }` where
   *   `completed` is true only if the LLM finished before hitting maxSteps.
   */
  async executeTask(taskDescription, options = {}) {
    const { verbose = true, systemPrompt, onProgress } = options;

    if (verbose) console.log('Fetching available tools...');
    onProgress?.({
      type: 'fetching_tools',
      message: 'Fetching available tools...',
    });

    // Fetch tools from the MCP server.
    const mcpTools = await this.fetchTools();
    onProgress?.({
      type: 'tools_fetched',
      message: `Found ${mcpTools.tools.length} available tools`,
      toolCount: mcpTools.tools.length,
    });

    // Transform MCP tools to LangChain tools. Only name/description/schema
    // are needed for binding; the loop below executes calls via the MCP
    // client directly, so the tool function is intentionally null.
    const langchainTools = mcpTools.tools.map((mcpTool) =>
      tool(null, {
        name: mcpTool.name,
        description: mcpTool.description || 'No description provided',
        schema: this.jsonSchemaToZod(mcpTool.inputSchema),
      })
    );

    // Bind tools to LLM.
    const llmWithTools = this.llm.bindTools(langchainTools);

    // Setup conversation.
    const messages = [
      {
        role: 'system',
        content:
          systemPrompt ||
          "You are a smart AI agent that can perform multi-step browser automation tasks. You should complete the entire task step by step. When you get a tool result, analyze it and decide what the next action should be. Complete the full user request, don't stop after the first step.",
      },
      {
        role: 'user',
        content: taskDescription,
      },
    ];

    onProgress?.({
      type: 'task_execution_started',
      task: taskDescription.substring(0, 200),
      maxSteps: this.config.maxSteps,
    });

    let currentStep = 0;
    let taskCompleted = false;
    const results = [];

    // Execute multi-step task.
    while (currentStep < this.config.maxSteps) {
      currentStep++;
      if (verbose) console.log(`\nStep ${currentStep}: Invoking LLM...`);
      onProgress?.({
        type: 'step_started',
        step: currentStep,
        message: `Starting step ${currentStep}...`,
      });
      onProgress?.({
        type: 'llm_thinking',
        step: currentStep,
        message: 'LLM is analyzing and deciding next action...',
      });

      const llmOutput = await llmWithTools.invoke(messages);

      if (llmOutput.tool_calls && llmOutput.tool_calls.length > 0) {
        // Only the first tool call per turn is executed.
        const toolCall = llmOutput.tool_calls[0];
        if (verbose) {
          console.log(`LLM wants to use tool: ${toolCall.name}`);
          console.log('Tool arguments:', toolCall.args);
        }
        onProgress?.({
          type: 'tool_call',
          step: currentStep,
          toolName: toolCall.name,
          toolArgs: toolCall.args,
          message: `Executing tool: ${toolCall.name}`,
        });
        try {
          // Execute tool via the MCP client.
          const result = await this.mcpClient.callTool({
            name: toolCall.name,
            arguments: toolCall.args,
          });
          if (verbose) console.log('Tool executed successfully');
          onProgress?.({
            type: 'tool_result',
            step: currentStep,
            toolName: toolCall.name,
            success: true,
            message: `Tool ${toolCall.name} executed successfully`,
          });
          results.push({
            step: currentStep,
            tool: toolCall.name,
            arguments: toolCall.args,
            result,
          });
          // Add the assistant turn and the tool result to the conversation
          // so the LLM can decide the next action.
          messages.push({
            role: 'assistant',
            content: llmOutput.content,
            tool_calls: llmOutput.tool_calls,
          });
          messages.push({
            role: 'tool',
            content: JSON.stringify(result),
            tool_call_id: toolCall.id,
          });
        } catch (toolError) {
          console.error('Tool execution failed:', toolError);
          onProgress?.({
            type: 'tool_error',
            step: currentStep,
            toolName: toolCall.name,
            error: toolError.message,
            message: `Tool ${toolCall.name} failed: ${toolError.message}`,
          });
          results.push({
            step: currentStep,
            tool: toolCall.name,
            arguments: toolCall.args,
            result: toolError.message,
            isError: true,
          });
          // Surface the error to the LLM so it can recover or re-plan.
          messages.push({
            role: 'assistant',
            content: `Tool execution failed: ${toolError.message}`,
          });
        }
        onProgress?.({
          type: 'step_completed',
          step: currentStep,
          message: `Step ${currentStep} completed`,
        });
      } else {
        // No tool call means the LLM considers the task done.
        if (verbose) console.log('Task completed! LLM response:', llmOutput.content);
        onProgress?.({
          type: 'task_completed',
          step: currentStep,
          response: llmOutput.content,
          message: 'Task completed by LLM',
        });
        results.push({
          step: currentStep,
          type: 'completion',
          response: llmOutput.content,
        });
        taskCompleted = true;
        break;
      }
    }

    if (!taskCompleted && currentStep >= this.config.maxSteps) {
      if (verbose) console.log('Warning: max steps reached, stopping execution');
      onProgress?.({
        type: 'max_steps_reached',
        step: currentStep,
        message: `Maximum steps (${this.config.maxSteps}) reached`,
      });
    }

    return {
      success: true,
      steps: currentStep,
      results,
      // Fixed: previously `currentStep < maxSteps`, which misreported a task
      // that legitimately completed on the final allowed step.
      completed: taskCompleted,
    };
  }

  /**
   * Close the automation system, releasing the MCP client connection.
   * Safe to call before `initialize()` (no-op).
   */
  async close() {
    if (this.mcpClient) {
      // Await so callers can rely on the connection being torn down.
      await this.mcpClient.close();
    }
  }
}