@ejazullah/smart-browser-automation

An AI-driven browser-automation library and REST API server that uses MCP (Model Context Protocol) and LangChain to execute multi-step tasks. It can be used both as a programmatic library and as an HTTP API server for remote automation.
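
Typical library usage looks like the sketch below, built against the class defined in the source that follows. The model id, API key variable, MCP endpoint, and CDP URL are placeholders to replace with your own values, not defaults shipped by the package:

import { SmartBrowserAutomation } from '@ejazullah/smart-browser-automation';

const automation = new SmartBrowserAutomation({
  maxSteps: 15,               // overrides the default of 10
  transportType: 'streamable' // or 'sse'
});

// llmConfig targets any OpenAI-compatible endpoint (OpenAI, Hugging Face,
// Ollama, ...); every value here is a placeholder.
await automation.initialize(
  {
    model: 'gpt-4o-mini',
    apiKey: process.env.LLM_API_KEY,
    baseURL: 'https://api.openai.com/v1'
  },
  'http://localhost:8931/mcp',  // MCP server endpoint (placeholder)
  'http://localhost:9222'       // browser driver / CDP URL (placeholder)
);

const outcome = await automation.executeTask(
  'Open https://duckduckgo.com and search for "Model Context Protocol"',
  {
    verbose: true,
    onProgress: (event) => console.log(event.type, event.message ?? '')
  }
);

console.log(`Finished after ${outcome.steps} step(s), completed: ${outcome.completed}`);
await automation.close();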

import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js';
import { SSEClientTransport } from '@modelcontextprotocol/sdk/client/sse.js';
import { ChatOpenAI } from '@langchain/openai';
import { tool } from '@langchain/core/tools';
import { z } from 'zod';

/**
 * Smart Browser Automation class for AI-driven multi-step browser tasks
 */
export class SmartBrowserAutomation {
  constructor(config) {
    this.config = {
      maxSteps: 10,
      temperature: 0.0,
      transportType: 'streamable', // 'streamable' or 'sse'
      ...config
    };
    this.mcpClient = null;
    this.llm = null;
    this.transport = null;
  }

  /**
   * Initialize the automation system
   * @param {Object} llmConfig - LLM configuration (Hugging Face, Ollama, etc.)
   * @param {string} mcpEndpoint - MCP server endpoint
   * @param {string} driverUrl - WebDriver/CDP URL
   */
  async initialize(llmConfig, mcpEndpoint, driverUrl) {
    // Initialize the LLM via an OpenAI-compatible endpoint
    this.llm = new ChatOpenAI({
      model: llmConfig.model,
      apiKey: llmConfig.apiKey,
      temperature: this.config.temperature,
      maxTokens: 2048,
      quantization: '4bit', // note: not a documented ChatOpenAI option
      configuration: {
        baseURL: llmConfig.baseURL
      }
    });

    // Initialize the MCP client
    this.mcpClient = new Client(
      {
        name: '@ejazullah/smart-browser-automation',
        version: '1.0.0',
      },
      {
        capabilities: {
          prompts: {},
          resources: {},
          tools: {},
          logging: {},
        },
      }
    );

    // Set up the transport based on the configured type
    if (this.config.transportType === 'sse') {
      // SSE transport
      this.transport = new SSEClientTransport(new URL(mcpEndpoint));
    } else {
      // Streamable HTTP transport (default); the driver URL is passed to the
      // MCP server as a cdpEndpoint query parameter
      const encodedDriverUrl = encodeURIComponent(driverUrl);
      this.transport = new StreamableHTTPClientTransport(
        new URL(`${mcpEndpoint}?cdpEndpoint=${encodedDriverUrl}`)
      );
    }

    // Connect to the MCP server
    await this.mcpClient.connect(this.transport);
  }

  /**
   * Fetch available tools from the MCP server
   */
  async fetchTools() {
    try {
      return await this.mcpClient.listTools();
    } catch (error) {
      console.error('āŒ Error fetching tools:', error);
      throw error;
    }
  }

  /**
   * Execute a single tool directly, without the LLM
   * @param {string} toolName - Name of the tool to execute
   * @param {Object} toolArgs - Arguments for the tool
   * @param {Object} options - Additional options
   */
  async callTool(toolName, toolArgs = {}, options = {}) {
    const { verbose = false } = options;

    if (!this.mcpClient) {
      throw new Error('MCP client not initialized. Call initialize() first.');
    }

    try {
      if (verbose) {
        console.log(`šŸ”§ Calling tool: ${toolName}`);
        console.log('šŸ“ Arguments:', toolArgs);
      }

      const result = await this.mcpClient.callTool({
        name: toolName,
        arguments: toolArgs,
      });

      if (verbose) {
        console.log('āœ… Tool executed successfully');
      }

      return { success: true, toolName, arguments: toolArgs, result };
    } catch (error) {
      if (verbose) {
        console.error('āŒ Tool execution failed:', error.message);
      }
      return { success: false, toolName, arguments: toolArgs, error: error.message };
    }
  }

  /**
   * Convert a JSON Schema node to a Zod schema.
   * Recurses into nested objects and arrays; unknown types fall back to z.any().
   */
  jsonSchemaToZod(jsonSchema) {
    if (!jsonSchema) {
      return z.any();
    }

    switch (jsonSchema.type) {
      case 'string':
        return z.string();
      case 'number':
        return z.number();
      case 'integer':
        return z.number().int();
      case 'boolean':
        return z.boolean();
      case 'array':
        return z.array(this.jsonSchemaToZod(jsonSchema.items));
      case 'object': {
        const shape = {};
        for (const [key, value] of Object.entries(jsonSchema.properties || {})) {
          let zodType = this.jsonSchemaToZod(value);
          // Properties not listed in `required` become optional and nullable
          if (!jsonSchema.required?.includes(key)) {
            zodType = zodType.optional().nullable();
          }
          shape[key] = zodType;
        }
        return z.object(shape);
      }
      default:
        return z.any();
    }
  }

  /**
   * Execute a smart browser automation task
   * @param {string} taskDescription - The task to execute
   * @param {Object} options - Additional options (verbose, systemPrompt, onProgress)
   */
  async executeTask(taskDescription, options = {}) {
    const { verbose = true, systemPrompt, onProgress } = options;

    if (verbose) console.log('šŸ” Fetching available tools...');
    if (onProgress) {
      onProgress({ type: 'fetching_tools', message: 'Fetching available tools...' });
    }

    // Fetch tools from the MCP server
    const mcpTools = await this.fetchTools();

    if (onProgress) {
      onProgress({
        type: 'tools_fetched',
        message: `Found ${mcpTools.tools.length} available tools`,
        toolCount: mcpTools.tools.length
      });
    }

    // Transform MCP tools into LangChain tool stubs. The handler is null on
    // purpose: the stubs only advertise name, description, and schema to the
    // LLM via bindTools(); actual execution goes through this.mcpClient below.
    const langchainTools = mcpTools.tools.map((mcpTool) => {
      return tool(null, {
        name: mcpTool.name,
        description: mcpTool.description || 'No description provided',
        schema: this.jsonSchemaToZod(mcpTool.inputSchema),
      });
    });

    // Bind the tools to the LLM
    const llmWithTools = this.llm.bindTools(langchainTools);

    // Set up the conversation
    const messages = [
      {
        role: 'system',
        content: systemPrompt ||
          "You are a smart AI agent that can perform multi-step browser automation tasks. You should complete the entire task step by step. When you get a tool result, analyze it and decide what the next action should be. Complete the full user request, don't stop after the first step."
      },
      { role: 'user', content: taskDescription },
    ];

    if (onProgress) {
      onProgress({
        type: 'task_execution_started',
        task: taskDescription.substring(0, 200),
        maxSteps: this.config.maxSteps
      });
    }

    let currentStep = 0;
    const results = [];

    // Multi-step agent loop: invoke the LLM, execute the tool it picks,
    // feed the result back, and repeat until it stops calling tools
    while (currentStep < this.config.maxSteps) {
      currentStep++;
      if (verbose) console.log(`\nšŸš€ Step ${currentStep}: Invoking LLM...`);

      if (onProgress) {
        onProgress({
          type: 'step_started',
          step: currentStep,
          message: `Starting step ${currentStep}...`
        });
        onProgress({
          type: 'llm_thinking',
          step: currentStep,
          message: 'LLM is analyzing and deciding next action...'
        });
      }

      const llmOutput = await llmWithTools.invoke(messages);

      if (llmOutput.tool_calls && llmOutput.tool_calls.length > 0) {
        const toolCall = llmOutput.tool_calls[0];

        if (verbose) {
          console.log(`šŸ”§ LLM wants to use tool: ${toolCall.name}`);
          console.log('šŸ“ Tool arguments:', toolCall.args);
        }

        if (onProgress) {
          onProgress({
            type: 'tool_call',
            step: currentStep,
            toolName: toolCall.name,
            toolArgs: toolCall.args,
            message: `Executing tool: ${toolCall.name}`
          });
        }

        try {
          // Execute the tool via the MCP client
          const result = await this.mcpClient.callTool({
            name: toolCall.name,
            arguments: toolCall.args,
          });

          if (verbose) console.log('āœ… Tool executed successfully');

          if (onProgress) {
            onProgress({
              type: 'tool_result',
              step: currentStep,
              toolName: toolCall.name,
              success: true,
              message: `Tool ${toolCall.name} executed successfully`
            });
          }

          results.push({
            step: currentStep,
            tool: toolCall.name,
            arguments: toolCall.args,
            result: result
          });

          // Append the assistant turn and the tool result to the conversation
          messages.push({
            role: 'assistant',
            content: llmOutput.content,
            tool_calls: llmOutput.tool_calls
          });
          messages.push({
            role: 'tool',
            content: JSON.stringify(result),
            tool_call_id: toolCall.id
          });
        } catch (toolError) {
          console.error('āŒ Tool execution failed:', toolError);

          if (onProgress) {
            onProgress({
              type: 'tool_error',
              step: currentStep,
              toolName: toolCall.name,
              error: toolError.message,
              message: `Tool ${toolCall.name} failed: ${toolError.message}`
            });
          }

          results.push({
            step: currentStep,
            tool: toolCall.name,
            arguments: toolCall.args,
            result: toolError.message,
            isError: true
          });

          // Surface the error to the LLM so it can adjust its next action
          messages.push({
            role: 'assistant',
            content: `Tool execution failed: ${toolError.message}`
          });
        }

        if (onProgress) {
          onProgress({
            type: 'step_completed',
            step: currentStep,
            message: `Step ${currentStep} completed`
          });
        }
      } else {
        // No tool call means the LLM considers the task complete
        if (verbose) console.log('šŸŽÆ Task completed! LLM response:', llmOutput.content);

        if (onProgress) {
          onProgress({
            type: 'task_completed',
            step: currentStep,
            response: llmOutput.content,
            message: 'Task completed by LLM'
          });
        }

        results.push({
          step: currentStep,
          type: 'completion',
          response: llmOutput.content
        });
        break;
      }
    }

    if (currentStep >= this.config.maxSteps) {
      if (verbose) console.log('āš ļø Max steps reached, stopping execution');
      if (onProgress) {
        onProgress({
          type: 'max_steps_reached',
          step: currentStep,
          message: `Maximum steps (${this.config.maxSteps}) reached`
        });
      }
    }

    return {
      success: true,
      steps: currentStep,
      results: results,
      completed: currentStep < this.config.maxSteps
    };
  }

  /**
   * Close the automation system
   */
  async close() {
    if (this.mcpClient) {
      await this.mcpClient.close();
    }
  }
}
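
For single actions, the LLM loop can be skipped entirely: callTool() forwards one invocation to the MCP server and reports failures in its return value rather than throwing. A sketch reusing the automation instance from the example above; the browser_navigate tool name and its { url } argument are assumptions and must match whatever the connected MCP server actually advertises (check with fetchTools()):

// List the tools the connected MCP server exposes
const { tools } = await automation.fetchTools();
console.log(tools.map((t) => t.name));

// Invoke one tool directly, bypassing the LLM. The tool name and arguments
// below are assumptions; substitute a tool reported by fetchTools().
const nav = await automation.callTool(
  'browser_navigate',
  { url: 'https://duckduckgo.com/' },
  { verbose: true }
);

if (!nav.success) {
  console.error('Tool failed:', nav.error);
}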