UNPKG

mcp-appium-visual

Version:

MCP Server for Appium mobile automation with visual recovery

535 lines 24.7 kB
import * as fs from "fs/promises"; import * as path from "path"; import sharp from "sharp"; // Import both the Jimp class and functions from our ESM wrapper import { Jimp, intToRGBA } from "./jimp-esm.js"; import * as Tesseract from "tesseract.js"; import pixelmatch from "pixelmatch"; import { PNG } from "pngjs"; /** * Image processing utilities for enhanced visual recovery */ export class ImageProcessor { /** * Performs template matching to find a smaller image within a larger one * @param screenshotPath Path to the full screenshot * @param templatePath Path to the template image to find * @param threshold Matching threshold (0.0 to 1.0) * @returns Coordinates of the match or null if not found */ static async findTemplateInImage(screenshotPath, templatePath, threshold = 0.8) { try { // Load both images using Jimp const screenshot = await Jimp.read(screenshotPath); const template = await Jimp.read(templatePath); const templateWidth = template.getWidth(); const templateHeight = template.getHeight(); const screenshotWidth = screenshot.getWidth(); const screenshotHeight = screenshot.getHeight(); // Calculate the maximum possible similarity (1.0) const maxDiff = templateWidth * templateHeight * 4 * 255; // 4 channels (RGBA) * max diff per channel let bestMatch = { x: 0, y: 0, diff: maxDiff }; // Sliding window approach to find the template for (let y = 0; y <= screenshotHeight - templateHeight; y += 2) { // Step by 2 pixels for faster performance for (let x = 0; x <= screenshotWidth - templateWidth; x += 2) { // Step by 2 pixels for faster performance let diff = 0; // Calculate pixel differences in the current window for (let ty = 0; ty < templateHeight; ty += 2) { for (let tx = 0; tx < templateWidth; tx += 2) { const screenshotPixel = intToRGBA(screenshot.getPixelColor(x + tx, y + ty)); const templatePixel = intToRGBA(template.getPixelColor(tx, ty)); // Calculate RGB difference diff += Math.abs(screenshotPixel.r - templatePixel.r); diff += Math.abs(screenshotPixel.g - templatePixel.g); diff += Math.abs(screenshotPixel.b - templatePixel.b); // Early termination if we've already exceeded the best match if (diff >= bestMatch.diff) { break; } } if (diff >= bestMatch.diff) { break; } } // Update best match if this position has a lower difference if (diff < bestMatch.diff) { bestMatch = { x, y, diff }; } } } // Convert diff to similarity score (0.0 to 1.0) const similarity = 1 - bestMatch.diff / maxDiff; // Return match if above threshold if (similarity >= threshold) { return { x: bestMatch.x + Math.floor(templateWidth / 2), y: bestMatch.y + Math.floor(templateHeight / 2), width: templateWidth, height: templateHeight, }; } return null; } catch (error) { console.error("Error in template matching:", error); return null; } } /** * Performs OCR on an image to extract text * @param imagePath Path to the image * @param options Configuration options * @returns Extracted text and bounding boxes */ static async performOCR(imagePath, options) { try { // Create a worker with language const worker = await Tesseract.createWorker(options?.lang || "eng"); let result; if (options?.rect) { // Process only a portion of the image const { left, top, width, height } = options.rect; result = await worker.recognize(imagePath, { rectangle: { left, top, width, height }, }); } else { // Process the entire image result = await worker.recognize(imagePath); } await worker.terminate(); // Transform the result into our expected format return { text: result.data.text, confidence: result.data.confidence, words: result.data.words.map((word) => ({ text: word.text, confidence: word.confidence, bbox: word.bbox, })), }; } catch (error) { console.error("Error performing OCR:", error); return { text: "", confidence: 0, words: [] }; } } /** * Compares two images and returns the difference as a percentage * @param image1Path Path to first image * @param image2Path Path to second image * @param options Comparison options * @returns Difference as percentage and diff image path */ static async compareImages(image1Path, image2Path, options) { try { const threshold = options?.threshold || 0.1; const outputDiffPath = options?.outputDiffPath || null; // Read images const img1 = PNG.sync.read(await fs.readFile(image1Path)); const img2 = PNG.sync.read(await fs.readFile(image2Path)); // Ensure both images are the same size if (img1.width !== img2.width || img1.height !== img2.height) { // Resize the second image to match the first const resizedImg2 = await sharp(image2Path) .resize(img1.width, img1.height) .png() .toBuffer(); // Convert the buffer back to PNG const img2Resized = PNG.sync.read(resizedImg2); // Create output buffer for diff const diff = new PNG({ width: img1.width, height: img1.height }); // Apply ignore regions by making them black in both images if (options?.ignoreRegions) { for (const region of options.ignoreRegions) { for (let y = region.y; y < region.y + region.height; y++) { for (let x = region.x; x < region.x + region.width; x++) { if (x < img1.width && y < img1.height) { const idx = (img1.width * y + x) << 2; img1.data[idx] = 0; img1.data[idx + 1] = 0; img1.data[idx + 2] = 0; img2Resized.data[idx] = 0; img2Resized.data[idx + 1] = 0; img2Resized.data[idx + 2] = 0; } } } } } // Compare images and get number of different pixels const numDiffPixels = pixelmatch(img1.data, img2Resized.data, diff.data, img1.width, img1.height, { threshold }); // Calculate difference percentage const diffPercentage = numDiffPixels / (img1.width * img1.height); // Save diff image if path provided let diffImagePath = null; if (outputDiffPath) { await fs.writeFile(outputDiffPath, PNG.sync.write(diff)); diffImagePath = outputDiffPath; } return { diffPercentage, diffImagePath }; } else { // Create output buffer for diff const diff = new PNG({ width: img1.width, height: img1.height }); // Apply ignore regions by making them black in both images if (options?.ignoreRegions) { for (const region of options.ignoreRegions) { for (let y = region.y; y < region.y + region.height; y++) { for (let x = region.x; x < region.x + region.width; x++) { if (x < img1.width && y < img1.height) { const idx = (img1.width * y + x) << 2; img1.data[idx] = 0; img1.data[idx + 1] = 0; img1.data[idx + 2] = 0; img2.data[idx] = 0; img2.data[idx + 1] = 0; img2.data[idx + 2] = 0; } } } } } // Compare images and get number of different pixels const numDiffPixels = pixelmatch(img1.data, img2.data, diff.data, img1.width, img1.height, { threshold }); // Calculate difference percentage const diffPercentage = numDiffPixels / (img1.width * img1.height); // Save diff image if path provided let diffImagePath = null; if (outputDiffPath) { await fs.writeFile(outputDiffPath, PNG.sync.write(diff)); diffImagePath = outputDiffPath; } return { diffPercentage, diffImagePath }; } } catch (error) { console.error("Error comparing images:", error); return { diffPercentage: 1, diffImagePath: null }; } } /** * Finds elements in an image based on visual characteristics * @param imagePath Path to the image * @returns Information about detected UI elements */ static async detectUIElements(imagePath) { try { // Load the image using Sharp const image = sharp(imagePath); const metadata = await image.metadata(); const { width = 0, height = 0 } = metadata; // Convert to grayscale for easier analysis const grayscale = await image.grayscale().toBuffer(); // Use edge detection to find potential UI elements const edges = await sharp(grayscale) .convolve({ width: 3, height: 3, kernel: [-1, -1, -1, -1, 8, -1, -1, -1, -1], }) .toBuffer(); // Perform OCR to identify text areas const ocrResult = await this.performOCR(imagePath); const elements = []; // Add text elements from OCR ocrResult.words.forEach((word) => { if (word.confidence > 70) { elements.push({ type: "text", confidence: word.confidence / 100, bbox: { x: word.bbox.x0, y: word.bbox.y0, width: word.bbox.x1 - word.bbox.x0, height: word.bbox.y1 - word.bbox.y0, }, }); } }); // Simple heuristic for buttons: look for rectangles with text inside // This is a very simplified approach - real button detection would be more complex const imageBuffer = await sharp(imagePath).toBuffer(); // Fix: Save to temporary file and read it with Jimp instead of using buffer directly const tempFilePath = path.join(path.dirname(imagePath), `temp_${Date.now()}.png`); await fs.writeFile(tempFilePath, imageBuffer); const jimpImg = await Jimp.read(tempFilePath); // Clean up temp file after use fs.unlink(tempFilePath).catch(() => { }); const potentialButtons = []; // Scan for rectangle-like structures with contrasting borders // Skip pixels for better performance const scanStep = 10; for (let y = 0; y < height - scanStep; y += scanStep) { for (let x = 0; x < width - scanStep; x += scanStep) { const centerColor = intToRGBA(jimpImg.getPixelColor(x, y)); let isRectangle = true; let right = x; let bottom = y; // Look for horizontal edge while (right < width - 1) { const rightColor = intToRGBA(jimpImg.getPixelColor(right + 1, y)); if (this.colorDifference(centerColor, rightColor) > 30) { break; } right += scanStep; if (right >= width - 1) break; } // Look for vertical edge while (bottom < height - 1) { const bottomColor = intToRGBA(jimpImg.getPixelColor(x, bottom + 1)); if (this.colorDifference(centerColor, bottomColor) > 30) { break; } bottom += scanStep; if (bottom >= height - 1) break; } // Check if we found a rectangle of reasonable size const rectWidth = right - x; const rectHeight = bottom - y; if (rectWidth > 60 && rectHeight > 30 && rectWidth < width / 2 && rectHeight < height / 2) { potentialButtons.push({ x, y, width: rectWidth, height: rectHeight, }); } } } // Add potential buttons to elements potentialButtons.forEach((button) => { // Check if this button contains text (from OCR) const hasText = ocrResult.words.some((word) => { return (word.bbox.x0 >= button.x && word.bbox.x1 <= button.x + button.width && word.bbox.y0 >= button.y && word.bbox.y1 <= button.y + button.height); }); elements.push({ type: hasText ? "button" : "rectangle", confidence: hasText ? 0.8 : 0.5, bbox: button, }); }); return elements; } catch (error) { console.error("Error detecting UI elements:", error); return []; } } /** * Calculate the difference between two colors * @param color1 First color * @param color2 Second color * @returns Difference value */ static colorDifference(color1, color2) { return (Math.abs(color1.r - color2.r) + Math.abs(color1.g - color2.g) + Math.abs(color1.b - color2.b)); } } /** * Visual recovery utilities for UI elements */ export class VisualRecovery { /** * Find an element by appearance when traditional locators fail * @param baseScreenshotPath Path to previous screenshot containing element * @param currentScreenshotPath Path to current screenshot * @param elementRegion Region of element in base screenshot * @returns Coordinates of element in new screenshot or null if not found */ static async recoverElementByAppearance(baseScreenshotPath, currentScreenshotPath, elementRegion) { try { // Extract the element image from the base screenshot const elementImage = await sharp(baseScreenshotPath) .extract({ left: elementRegion.x, top: elementRegion.y, width: elementRegion.width, height: elementRegion.height, }) .toBuffer(); // Create a temporary file for the element template const tempDir = path.dirname(baseScreenshotPath); const elementTemplatePath = path.join(tempDir, `temp_element_${Date.now()}.png`); await fs.writeFile(elementTemplatePath, elementImage); try { // Use template matching to find this element in the current screenshot const match = await ImageProcessor.findTemplateInImage(currentScreenshotPath, elementTemplatePath, 0.7); // Clean up the temporary file await fs.unlink(elementTemplatePath); return match; } catch (error) { // Clean up the temporary file even if there was an error try { await fs.unlink(elementTemplatePath); } catch { /* ignore cleanup errors */ } throw error; } } catch (error) { console.error("Error recovering element by appearance:", error); return null; } } /** * Find text in an image and return its location * @param screenshotPath Path to the screenshot * @param text Text to find * @param options Optional parameters * @returns Coordinates of the text or null if not found */ static async findTextInImage(screenshotPath, text, options) { try { const minConfidence = options?.minConfidence || 0.7; // Perform OCR on the whole image or just the search region let ocrResult; if (options?.searchRegion) { // Convert from x,y to left,top format const { x, y, width, height } = options.searchRegion; ocrResult = await ImageProcessor.performOCR(screenshotPath, { rect: { left: x, top: y, width: width, height: height, }, }); } else { ocrResult = await ImageProcessor.performOCR(screenshotPath); } // Normalize input text for comparison const normalizedSearchText = text.toLowerCase().trim(); // Find the word that best matches our search text let bestMatch = null; for (const word of ocrResult.words) { const normalizedWord = word.text.toLowerCase().trim(); // Check if this word matches our search text if ((normalizedWord === normalizedSearchText || normalizedWord.includes(normalizedSearchText) || normalizedSearchText.includes(normalizedWord)) && word.confidence / 100 >= minConfidence && (!bestMatch || word.confidence > bestMatch.confidence)) { bestMatch = word; } } // If we found a match, return its location if (bestMatch) { let x0 = bestMatch.bbox.x0; let y0 = bestMatch.bbox.y0; // If we were searching in a region, adjust coordinates if (options?.searchRegion) { x0 += options.searchRegion.x; y0 += options.searchRegion.y; } return { x: Math.round((bestMatch.bbox.x0 + bestMatch.bbox.x1) / 2), y: Math.round((bestMatch.bbox.y0 + bestMatch.bbox.y1) / 2), width: bestMatch.bbox.x1 - bestMatch.bbox.x0, height: bestMatch.bbox.y1 - bestMatch.bbox.y0, confidence: bestMatch.confidence / 100, }; } return null; } catch (error) { console.error("Error finding text in image:", error); return null; } } /** * Find a UI element by visual characteristics * @param screenshotPath Path to the screenshot * @param options Search options * @returns Element coordinates or null if not found */ static async findElementByVisualCharacteristics(screenshotPath, options) { try { // First, look for UI elements in the image const elements = await ImageProcessor.detectUIElements(screenshotPath); // Filter by specified constraints let filteredElements = elements; // Filter by element type if specified if (options.elementType && options.elementType !== "any") { filteredElements = filteredElements.filter((el) => el.type.toLowerCase().includes(options.elementType.toLowerCase())); } // Filter by region if specified if (options.region) { const { x, y, width, height } = options.region; filteredElements = filteredElements.filter((el) => { const elX = el.bbox.x; const elY = el.bbox.y; return elX >= x && elX <= x + width && elY >= y && elY <= y + height; }); } // Check for expected text if specified if (options.expectedText) { // Find this text in the image const textLocation = await this.findTextInImage(screenshotPath, options.expectedText); if (textLocation) { // Find elements that contain this text location filteredElements = filteredElements.filter((el) => { return (textLocation.x >= el.bbox.x && textLocation.x <= el.bbox.x + el.bbox.width && textLocation.y >= el.bbox.y && textLocation.y <= el.bbox.y + el.bbox.height); }); } else { // If we were specifically looking for text and didn't find it, return null return null; } } // Look for elements near specified text if (options.nearText) { const textLocation = await this.findTextInImage(screenshotPath, options.nearText); if (textLocation) { // Sort elements by proximity to this text filteredElements.sort((a, b) => { const distA = Math.sqrt(Math.pow(a.bbox.x - textLocation.x, 2) + Math.pow(a.bbox.y - textLocation.y, 2)); const distB = Math.sqrt(Math.pow(b.bbox.x - textLocation.x, 2) + Math.pow(b.bbox.y - textLocation.y, 2)); return distA - distB; }); } } // If we have any elements left after filtering, return the best match if (filteredElements.length > 0) { // Sort by confidence filteredElements.sort((a, b) => b.confidence - a.confidence); const best = filteredElements[0]; return { x: best.bbox.x + Math.floor(best.bbox.width / 2), y: best.bbox.y + Math.floor(best.bbox.height / 2), width: best.bbox.width, height: best.bbox.height, confidence: best.confidence, type: best.type, }; } return null; } catch (error) { console.error("Error finding element by visual characteristics:", error); return null; } } } //# sourceMappingURL=imageProcessor.js.map