browser-x-mcp

AI-Powered Browser Automation with Advanced Form Testing - A Model Context Protocol (MCP) server that enables intelligent browser automation with form testing, element extraction, and comprehensive logging

/**
 * Screenshot Analyzer Tool
 * Visual element detection and coordinate mapping for Browser[X]MCP
 */

import sharp from 'sharp';
import fs from 'fs/promises';
import path from 'path';

export class ScreenshotAnalyzer {
  constructor(options = {}) {
    this.screenshotPath = options.screenshotPath || './screenshots';
    this.outputPath = options.outputPath || './analysis-output';
    this.aiProvider = options.aiProvider || 'openrouter';
    this.enableCache = options.enableCache ?? true; // caching on unless explicitly disabled
    this.cache = new Map();
    this.coordinates = new Map();
  }

  /**
   * Analyze full screenshot for interactive elements
   * @param {Buffer|string} screenshotBuffer - Screenshot buffer or path
   * @param {Object} options - Analysis options
   * @returns {Promise<Object>} Analysis results with coordinates
   */
  async analyzeFullScreenshot(screenshotBuffer, options = {}) {
    try {
      const { width, height, channels } = await sharp(screenshotBuffer).metadata();

      // Extract potential interactive elements using image processing
      const elements = await this.detectInteractiveElements(screenshotBuffer);

      // Map coordinates to elements
      const coordinateMap = await this.generateCoordinateMap(elements, { width, height });

      // Optionally send to AI for semantic analysis
      if (options.useAI) {
        const aiAnalysis = await this.analyzeWithAI(screenshotBuffer, elements);
        return this.mergeAnalysisResults(coordinateMap, aiAnalysis);
      }

      return {
        timestamp: new Date().toISOString(),
        dimensions: { width, height, channels },
        elements: coordinateMap,
        totalElements: elements.length,
        interactiveElements: elements.filter(el => el.interactive).length
      };
    } catch (error) {
      throw new Error(`Screenshot analysis failed: ${error.message}`);
    }
  }

  /**
   * Analyze cropped screenshot for targeted interaction
   * @param {Buffer} croppedBuffer - Cropped screenshot buffer
   * @param {Object} originalCoordinates - Crop offset within the original screenshot {x, y}
   * @returns {Promise<Object>} Targeted analysis with precise coordinates
   */
  async analyzeCroppedScreenshot(croppedBuffer, originalCoordinates) {
    try {
      const { width, height } = await sharp(croppedBuffer).metadata();
      const { x: offsetX, y: offsetY } = originalCoordinates;

      // Detect elements in cropped area
      const croppedElements = await this.detectInteractiveElements(croppedBuffer);

      // Adjust coordinates back to the original screenshot's frame
      const adjustedElements = croppedElements.map(element => ({
        ...element,
        coordinates: {
          x: element.coordinates.x + offsetX,
          y: element.coordinates.y + offsetY,
          width: element.coordinates.width,
          height: element.coordinates.height
        },
        clickPoint: {
          x: element.clickPoint.x + offsetX,
          y: element.clickPoint.y + offsetY
        }
      }));

      return {
        timestamp: new Date().toISOString(),
        cropArea: originalCoordinates,
        dimensions: { width, height },
        elements: adjustedElements,
        totalElements: adjustedElements.length
      };
    } catch (error) {
      throw new Error(`Cropped screenshot analysis failed: ${error.message}`);
    }
  }
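
  // Usage sketch (hypothetical; assumes `buffer` comes from a browser driver,
  // e.g. a Playwright-style `page.screenshot()` - the driver is not part of
  // this module):
  //
  //   const analyzer = new ScreenshotAnalyzer({ outputPath: './out' });
  //   const full = await analyzer.analyzeFullScreenshot(buffer);
  //   // Zoom into a 400x200 region at (50, 80); results map back to page coordinates:
  //   const region = { x: 50, y: 80, width: 400, height: 200 };
  //   const cropped = await analyzer.cropScreenshot(buffer, region);
  //   const targeted = await analyzer.analyzeCroppedScreenshot(cropped, region);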

  /**
   * Generate precise click coordinates for AI
   * @param {Object} element - Element to click
   * @param {Object} options - Click options
   * @returns {Promise<Object>} Optimized click coordinates
   */
  async generateClickCoordinates(element, options = {}) {
    const { coordinates } = element;
    const { strategy = 'center', offset = { x: 0, y: 0 } } = options;

    let clickPoint;
    switch (strategy) {
      case 'center':
        clickPoint = {
          x: coordinates.x + (coordinates.width / 2) + offset.x,
          y: coordinates.y + (coordinates.height / 2) + offset.y
        };
        break;
      case 'top-left':
        clickPoint = {
          x: coordinates.x + 5 + offset.x,
          y: coordinates.y + 5 + offset.y
        };
        break;
      case 'custom':
        clickPoint = {
          x: coordinates.x + offset.x,
          y: coordinates.y + offset.y
        };
        break;
      default:
        clickPoint = {
          x: coordinates.x + (coordinates.width / 2),
          y: coordinates.y + (coordinates.height / 2)
        };
    }

    return {
      element: element,
      clickCoordinates: clickPoint,
      confidence: this.calculateClickConfidence(element, clickPoint),
      strategy: strategy
    };
  }

  /**
   * Crop screenshot to specific area
   * @param {Buffer} originalBuffer - Original screenshot
   * @param {Object} cropArea - Area to crop {x, y, width, height}
   * @returns {Promise<Buffer>} Cropped screenshot buffer
   */
  async cropScreenshot(originalBuffer, cropArea) {
    try {
      const { x, y, width, height } = cropArea;
      // sharp's extract() requires integer pixel offsets, so round any fractional values
      const croppedBuffer = await sharp(originalBuffer)
        .extract({
          left: Math.round(x),
          top: Math.round(y),
          width: Math.round(width),
          height: Math.round(height)
        })
        .png()
        .toBuffer();
      return croppedBuffer;
    } catch (error) {
      throw new Error(`Screenshot cropping failed: ${error.message}`);
    }
  }

  /**
   * Detect interactive elements using image processing
   * @param {Buffer} screenshotBuffer - Screenshot to analyze
   * @returns {Promise<Array>} Array of detected elements
   */
  async detectInteractiveElements(screenshotBuffer) {
    // This is a placeholder for advanced image processing.
    // A real implementation would use computer vision libraries
    // such as OpenCV, TensorFlow.js, or cloud vision APIs.
    const elements = [];

    // Mock detection logic - replace with actual image processing.
    // The metadata call also validates that the buffer is a decodable image.
    const { width, height } = await sharp(screenshotBuffer).metadata();

    // Simulate button detection
    elements.push({
      type: 'button',
      coordinates: { x: 100, y: 150, width: 120, height: 40 },
      clickPoint: { x: 160, y: 170 },
      confidence: 0.95,
      interactive: true,
      text: 'Submit',
      color: { r: 66, g: 133, b: 244 }
    });

    // Simulate input field detection
    elements.push({
      type: 'input',
      coordinates: { x: 200, y: 100, width: 300, height: 30 },
      clickPoint: { x: 350, y: 115 },
      confidence: 0.88,
      interactive: true,
      placeholder: 'Enter text here',
      backgroundColor: { r: 255, g: 255, b: 255 }
    });

    return elements;
  }

  /**
   * Generate coordinate map for elements
   * @param {Array} elements - Detected elements
   * @param {Object} dimensions - Screenshot dimensions
   * @returns {Promise<Object>} Coordinate mapping
   */
  async generateCoordinateMap(elements, dimensions) {
    const map = {};
    elements.forEach((element, index) => {
      const id = `element_${index}`;
      map[id] = {
        ...element,
        id,
        relativePosition: {
          x: element.coordinates.x / dimensions.width,
          y: element.coordinates.y / dimensions.height
        }
      };
    });
    return map;
  }

  /**
   * Analyze screenshot with AI for semantic understanding
   * @param {Buffer} screenshotBuffer - Screenshot to analyze
   * @param {Array} detectedElements - Pre-detected elements
   * @returns {Promise<Object>} AI analysis results
   */
  async analyzeWithAI(screenshotBuffer, detectedElements) {
    // Placeholder for AI integration.
    // This would send the screenshot to AI models for semantic analysis.
    return {
      semanticElements: detectedElements.map(el => ({
        ...el,
        semanticLabel: this.generateSemanticLabel(el),
        purpose: this.inferElementPurpose(el),
        actionable: true
      })),
      pageContext: {
        pageType: 'form',
        primaryAction: 'submit',
        confidence: 0.92
      }
    };
  }
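
  // Click-targeting sketch (hypothetical element literal, for illustration):
  //
  //   const target = {
  //     type: 'button',
  //     interactive: true,
  //     coordinates: { x: 100, y: 150, width: 120, height: 40 }
  //   };
  //   const click = await analyzer.generateClickCoordinates(target, {
  //     strategy: 'center',
  //     offset: { x: 0, y: -2 }   // nudge 2px above the geometric center
  //   });
  //   // click.clickCoordinates -> { x: 160, y: 168 }
  //   // click.confidence -> 1.0 (interactive element, point inside its bounds)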

  /**
   * Calculate click confidence for an element
   * @param {Object} element - Element to evaluate
   * @param {Object} clickPoint - Proposed click point
   * @returns {number} Confidence score (0-1)
   */
  calculateClickConfidence(element, clickPoint) {
    let confidence = 0.5;

    // Boost confidence for interactive elements
    if (element.interactive) confidence += 0.3;

    // Boost confidence if click point is within element bounds
    const { x, y, width, height } = element.coordinates;
    if (
      clickPoint.x >= x && clickPoint.x <= x + width &&
      clickPoint.y >= y && clickPoint.y <= y + height
    ) {
      confidence += 0.2;
    }

    return Math.min(confidence, 1.0);
  }

  /**
   * Generate semantic label for element
   * @param {Object} element - Element to label
   * @returns {string} Semantic label
   */
  generateSemanticLabel(element) {
    if (element.text) return element.text;
    if (element.placeholder) return element.placeholder;
    return `${element.type}_element`;
  }

  /**
   * Infer element purpose
   * @param {Object} element - Element to analyze
   * @returns {string} Inferred purpose
   */
  inferElementPurpose(element) {
    if (element.type === 'button') return 'action';
    if (element.type === 'input') return 'data_entry';
    return 'interaction';
  }

  /**
   * Merge analysis results from different sources
   * @param {Object} coordinateMap - Coordinate-based analysis
   * @param {Object} aiAnalysis - AI-based analysis
   * @returns {Object} Merged results
   */
  mergeAnalysisResults(coordinateMap, aiAnalysis) {
    const merged = { ...coordinateMap };

    // Enhance with AI insights, matching AI elements to detected elements by
    // their full top-left coordinates so elements sharing an x value are not confused
    Object.keys(merged).forEach(elementId => {
      const aiElement = aiAnalysis.semanticElements.find(
        el =>
          el.coordinates.x === merged[elementId].coordinates.x &&
          el.coordinates.y === merged[elementId].coordinates.y
      );
      if (aiElement) {
        merged[elementId] = {
          ...merged[elementId],
          semanticLabel: aiElement.semanticLabel,
          purpose: aiElement.purpose,
          aiConfidence: aiElement.confidence || 0.8
        };
      }
    });

    return {
      elements: merged,
      pageContext: aiAnalysis.pageContext,
      analysisMethod: 'hybrid'
    };
  }

  /**
   * Save analysis results to file
   * @param {Object} analysisResults - Results to save
   * @param {string} filename - Output filename (without extension)
   * @returns {Promise<string>} Saved file path
   */
  async saveAnalysisResults(analysisResults, filename) {
    try {
      await fs.mkdir(this.outputPath, { recursive: true });
      const filePath = path.join(this.outputPath, `${filename}.json`);
      await fs.writeFile(filePath, JSON.stringify(analysisResults, null, 2));
      return filePath;
    } catch (error) {
      throw new Error(`Failed to save analysis results: ${error.message}`);
    }
  }
}

// Export for use in other modules
export default ScreenshotAnalyzer;
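
// End-to-end sketch (hypothetical, for illustration only; `page` is assumed to
// be a Playwright-style page object, which this module does not provide):
//
//   import ScreenshotAnalyzer from './screenshot-analyzer.js';
//
//   const analyzer = new ScreenshotAnalyzer();
//   const buffer = await page.screenshot();
//   const analysis = await analyzer.analyzeFullScreenshot(buffer, { useAI: true });
//   const submit = Object.values(analysis.elements).find(el => el.type === 'button');
//   const { clickCoordinates } = await analyzer.generateClickCoordinates(submit);
//   await page.mouse.click(clickCoordinates.x, clickCoordinates.y);
//   await analyzer.saveAnalysisResults(analysis, 'landing-page');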