browser-x-mcp
AI-Powered Browser Automation with Advanced Form Testing - a Model Context Protocol (MCP) server that enables intelligent browser automation with form testing, element extraction, and comprehensive logging
355 lines (305 loc) • 12.5 kB
JavaScript
/**
* Screenshot Analyzer Tool
* Visual element detection and coordinate mapping for Browser[X]MCP
*/
import sharp from 'sharp';
import fs from 'fs/promises';
import path from 'path';
export class ScreenshotAnalyzer {
constructor(options = {}) {
this.screenshotPath = options.screenshotPath || './screenshots';
this.outputPath = options.outputPath || './analysis-output';
this.aiProvider = options.aiProvider || 'openrouter';
this.enableCache = options.enableCache !== false; // default to true while still honoring an explicit false ("|| true" would always yield true)
this.cache = new Map(); // reserved for memoizing analysis results (not yet wired up below)
this.coordinates = new Map(); // reserved for element coordinate lookups (not yet wired up below)
}
/**
* Analyze full screenshot for interactive elements
* @param {Buffer|string} screenshotBuffer - Screenshot buffer or file path
* @param {Object} options - Analysis options
* @returns {Promise<Object>} Analysis results with coordinates
*/
async analyzeFullScreenshot(screenshotBuffer, options = {}) {
try {
const { width, height, channels } = await sharp(screenshotBuffer).metadata();
// Extract potential interactive elements using image processing
const elements = await this.detectInteractiveElements(screenshotBuffer);
// Map coordinates to elements
const coordinateMap = await this.generateCoordinateMap(elements, { width, height });
// Optionally send to AI for semantic analysis
if (options.useAI) {
const aiAnalysis = await this.analyzeWithAI(screenshotBuffer, elements);
return this.mergeAnalysisResults(coordinateMap, aiAnalysis);
}
return {
timestamp: new Date().toISOString(),
dimensions: { width, height, channels },
elements: coordinateMap,
totalElements: elements.length,
interactiveElements: elements.filter(el => el.interactive).length
};
} catch (error) {
throw new Error(`Screenshot analysis failed: ${error.message}`);
}
}
/**
* Analyze cropped screenshot for targeted interaction
* @param {Buffer} croppedBuffer - Cropped screenshot buffer
* @param {Object} originalCoordinates - Crop area within the original screenshot; its x/y become the coordinate offset
* @returns {Promise<Object>} Targeted analysis with precise coordinates
*/
async analyzeCroppedScreenshot(croppedBuffer, originalCoordinates) {
try {
const { width, height } = await sharp(croppedBuffer).metadata();
const { x: offsetX, y: offsetY } = originalCoordinates;
// Detect elements in cropped area
const croppedElements = await this.detectInteractiveElements(croppedBuffer);
// Adjust coordinates to original screenshot
const adjustedElements = croppedElements.map(element => ({
...element,
coordinates: {
x: element.coordinates.x + offsetX,
y: element.coordinates.y + offsetY,
width: element.coordinates.width,
height: element.coordinates.height
},
clickPoint: {
x: element.clickPoint.x + offsetX,
y: element.clickPoint.y + offsetY
}
}));
return {
timestamp: new Date().toISOString(),
cropArea: originalCoordinates,
dimensions: { width, height },
elements: adjustedElements,
totalElements: adjustedElements.length
};
} catch (error) {
throw new Error(`Cropped screenshot analysis failed: ${error.message}`);
}
}
/**
* Generate precise click coordinates for AI
* @param {Object} element - Element to click
* @param {Object} options - Click options
* @returns {Promise<Object>} Optimized click coordinates
*/
async generateClickCoordinates(element, options = {}) {
const { coordinates } = element;
const { strategy = 'center', offset = { x: 0, y: 0 } } = options;
let clickPoint;
switch (strategy) {
case 'center':
clickPoint = {
x: coordinates.x + (coordinates.width / 2) + offset.x,
y: coordinates.y + (coordinates.height / 2) + offset.y
};
break;
case 'top-left':
clickPoint = {
x: coordinates.x + 5 + offset.x,
y: coordinates.y + 5 + offset.y
};
break;
case 'custom':
clickPoint = {
x: coordinates.x + offset.x,
y: coordinates.y + offset.y
};
break;
default:
// Unknown strategy: fall back to the element center (note: offset is ignored here)
clickPoint = {
x: coordinates.x + (coordinates.width / 2),
y: coordinates.y + (coordinates.height / 2)
};
}
return {
element: element,
clickCoordinates: clickPoint,
confidence: this.calculateClickConfidence(element, clickPoint),
strategy: strategy
};
}
/**
* Crop screenshot to specific area
* @param {Buffer} originalBuffer - Original screenshot
* @param {Object} cropArea - Area to crop {x, y, width, height}
* @returns {Promise<Buffer>} Cropped screenshot buffer
*/
async cropScreenshot(originalBuffer, cropArea) {
try {
const { x, y, width, height } = cropArea;
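// Hedged aside: sharp throws if the extract area falls outside the image,
// so a defensive variant could clamp the crop area first (sketch only):
//
//   const meta = await sharp(originalBuffer).metadata();
//   const left = Math.max(0, x);
//   const top = Math.max(0, y);
//   const safe = {
//     left,
//     top,
//     width: Math.min(width, meta.width - left),
//     height: Math.min(height, meta.height - top)
//   };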
const croppedBuffer = await sharp(originalBuffer)
.extract({ left: x, top: y, width, height })
.png()
.toBuffer();
return croppedBuffer;
} catch (error) {
throw new Error(`Screenshot cropping failed: ${error.message}`);
}
}
/**
* Detect interactive elements using image processing
* @param {Buffer} screenshotBuffer - Screenshot to analyze
* @returns {Promise<Array>} Array of detected elements
*/
async detectInteractiveElements(screenshotBuffer) {
// This is a placeholder for advanced image processing
// In a real implementation, you would use computer vision libraries
// like OpenCV, TensorFlow.js, or cloud vision APIs
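// A minimal sketch of what a sharp-only first pass could look like
// (an assumption, not part of this module: scan an edge map for dense
// rectangular clusters and emit them as candidate elements):
//
//   const { data, info } = await sharp(screenshotBuffer)
//     .greyscale()
//     .convolve({
//       width: 3,
//       height: 3,
//       kernel: [-1, -1, -1, -1, 8, -1, -1, -1, -1] // Laplacian edge detector
//     })
//     .raw()
//     .toBuffer({ resolveWithObject: true });
//   // data is an info.width * info.height array of edge intensities; group
//   // high-intensity pixels into bounding boxes to propose elements.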
const elements = [];
// Mock detection logic - replace with actual image processing
const { width, height } = await sharp(screenshotBuffer).metadata();
// Simulate button detection
elements.push({
type: 'button',
coordinates: { x: 100, y: 150, width: 120, height: 40 },
clickPoint: { x: 160, y: 170 },
confidence: 0.95,
interactive: true,
text: 'Submit',
color: { r: 66, g: 133, b: 244 }
});
// Simulate input field detection
elements.push({
type: 'input',
coordinates: { x: 200, y: 100, width: 300, height: 30 },
clickPoint: { x: 350, y: 115 },
confidence: 0.88,
interactive: true,
placeholder: 'Enter text here',
backgroundColor: { r: 255, g: 255, b: 255 }
});
return elements;
}
/**
* Generate coordinate map for elements
* @param {Array} elements - Detected elements
* @param {Object} dimensions - Screenshot dimensions
* @returns {Promise<Object>} Coordinate mapping
*/
async generateCoordinateMap(elements, dimensions) {
const map = {};
elements.forEach((element, index) => {
const id = `element_${index}`;
map[id] = {
...element,
id,
relativePosition: {
x: element.coordinates.x / dimensions.width,
y: element.coordinates.y / dimensions.height
}
};
});
return map;
}
/**
* Analyze screenshot with AI for semantic understanding
* @param {Buffer} screenshotBuffer - Screenshot to analyze
* @param {Array} detectedElements - Pre-detected elements
* @returns {Promise<Object>} AI analysis results
*/
async analyzeWithAI(screenshotBuffer, detectedElements) {
// Placeholder for AI integration
// This would send the screenshot to AI models for semantic analysis
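// One possible wiring, hedged as a sketch: this.aiProvider defaults to
// 'openrouter', whose API is OpenAI-compatible; the model name and the
// OPENROUTER_API_KEY environment variable below are assumptions.
//
//   const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
//     method: 'POST',
//     headers: {
//       'Authorization': `Bearer ${process.env.OPENROUTER_API_KEY}`,
//       'Content-Type': 'application/json'
//     },
//     body: JSON.stringify({
//       model: 'openai/gpt-4o',
//       messages: [{
//         role: 'user',
//         content: [
//           { type: 'text', text: 'Label each UI element and infer its purpose.' },
//           { type: 'image_url', image_url: { url:
//             `data:image/png;base64,${screenshotBuffer.toString('base64')}` } }
//         ]
//       }]
//     })
//   });
//   const aiResult = await response.json();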
return {
semanticElements: detectedElements.map(el => ({
...el,
semanticLabel: this.generateSemanticLabel(el),
purpose: this.inferElementPurpose(el),
actionable: true
})),
pageContext: {
pageType: 'form',
primaryAction: 'submit',
confidence: 0.92
}
};
}
/**
* Calculate click confidence for an element
* @param {Object} element - Element to evaluate
* @param {Object} clickPoint - Proposed click point
* @returns {number} Confidence score (0-1)
*/
calculateClickConfidence(element, clickPoint) {
let confidence = 0.5;
// Boost confidence for interactive elements
if (element.interactive) confidence += 0.3;
// Boost confidence if click point is within element bounds
const { x, y, width, height } = element.coordinates;
if (clickPoint.x >= x && clickPoint.x <= x + width &&
clickPoint.y >= y && clickPoint.y <= y + height) {
confidence += 0.2;
}
return Math.min(confidence, 1.0);
}
/**
* Generate semantic label for element
* @param {Object} element - Element to label
* @returns {string} Semantic label
*/
generateSemanticLabel(element) {
if (element.text) return element.text;
if (element.placeholder) return element.placeholder;
return `${element.type}_element`;
}
/**
* Infer element purpose
* @param {Object} element - Element to analyze
* @returns {string} Inferred purpose
*/
inferElementPurpose(element) {
if (element.type === 'button') return 'action';
if (element.type === 'input') return 'data_entry';
return 'interaction';
}
/**
* Merge analysis results from different sources
* @param {Object} coordinateMap - Coordinate-based analysis
* @param {Object} aiAnalysis - AI-based analysis
* @returns {Object} Merged results
*/
mergeAnalysisResults(coordinateMap, aiAnalysis) {
const merged = { ...coordinateMap };
// Enhance with AI insights
Object.keys(merged).forEach(elementId => {
// Match AI elements to detected elements by position; compare both x and
// y, since x alone can collide when elements share a column
const aiElement = aiAnalysis.semanticElements.find(
el => el.coordinates.x === merged[elementId].coordinates.x &&
el.coordinates.y === merged[elementId].coordinates.y
);
if (aiElement) {
merged[elementId] = {
...merged[elementId],
semanticLabel: aiElement.semanticLabel,
purpose: aiElement.purpose,
aiConfidence: aiElement.confidence || 0.8
};
}
});
return {
elements: merged,
pageContext: aiAnalysis.pageContext,
analysisMethod: 'hybrid'
};
}
/**
* Save analysis results to file
* @param {Object} analysisResults - Results to save
* @param {string} filename - Output filename
* @returns {Promise<string>} Saved file path
*/
async saveAnalysisResults(analysisResults, filename) {
try {
await fs.mkdir(this.outputPath, { recursive: true });
const filePath = path.join(this.outputPath, `${filename}.json`);
await fs.writeFile(filePath, JSON.stringify(analysisResults, null, 2));
return filePath;
} catch (error) {
throw new Error(`Failed to save analysis results: ${error.message}`);
}
}
}
// Export for use in other modules
export default ScreenshotAnalyzer;
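// Example usage (a sketch; the import path is illustrative, and
// page.screenshot() assumes a Puppeteer- or Playwright-style page object -
// any PNG buffer or file path works):
//
//   import ScreenshotAnalyzer from './screenshot-analyzer.js';
//
//   const analyzer = new ScreenshotAnalyzer({ outputPath: './analysis-output' });
//   const buffer = await page.screenshot({ fullPage: true });
//   const analysis = await analyzer.analyzeFullScreenshot(buffer, { useAI: false });
//
//   // Pick the first detected button and compute where to click it
//   const button = Object.values(analysis.elements).find(el => el.type === 'button');
//   const { clickCoordinates } = await analyzer.generateClickCoordinates(button, {
//     strategy: 'center'
//   });
//
//   await analyzer.saveAnalysisResults(analysis, 'homepage-analysis');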