/**
 * ocr-click-plugin
 *
 * An Appium plugin that uses OCR (Optical Character Recognition) to find and
 * click text elements on mobile device screens, with AI-powered screen analysis.
 */
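//
// Illustrative setup (assumptions: the package is published to npm under this
// name and registered as an Appium 2.x plugin; exact install names may differ):
//
//   appium plugin install --source=npm ocr-click-plugin
//   appium --use-plugins=ocr-click-plugin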
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.OCRClickPlugin = void 0;
const base_plugin_1 = require("@appium/base-plugin");
const tesseract_js_1 = __importDefault(require("tesseract.js"));
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
// Try to import Google Vertex AI, but handle gracefully if not available
let VertexAI = null;
try {
const { VertexAI: VertexAIClass } = require('@google-cloud/vertexai');
VertexAI = VertexAIClass;
console.log('✅ Google Cloud Vertex AI SDK loaded successfully');
}
catch (error) {
console.warn('⚠️ Google Cloud Vertex AI SDK not available. AI functionality will be disabled. Install with: npm install @google-cloud/vertexai');
}
// Try to import Sharp, but handle gracefully if not available
let sharp = null;
try {
sharp = require('sharp');
console.log('✅ Sharp image processing library loaded successfully');
}
catch (error) {
console.warn('⚠️ Sharp not available. Image enhancement will be disabled. Install with: SHARP_IGNORE_GLOBAL_LIBVIPS=1 npm install --include=optional sharp');
}
const SOURCE_URL_REGEX = new RegExp('/session/[^/]+/appium/plugin/textclick');
const CHECK_TEXT_URL_REGEX = new RegExp('/session/[^/]+/appium/plugin/checktext');
const ASK_LLM_URL_REGEX = new RegExp('/session/[^/]+/appium/plugin/askllm');
const ASK_LLM_SIMPLE_URL_REGEX = new RegExp('/session/[^/]+/appium/plugin/askllm-simple');
const EXECUTE_URL_REGEX = new RegExp('/session/[^/]+/execute');
// Tesseract configuration for better accuracy
const TESSERACT_CONFIG = {
lang: 'eng',
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?-_@#$%^&*()', // Limit recognized characters
tessedit_pageseg_mode: '6', // Assume uniform text block
tessedit_do_invert: '0',
preserve_interword_spaces: '1',
tessjs_create_pdf: '0',
tessjs_create_hocr: '0',
tessjs_create_tsv: '0',
};
// Minimum confidence threshold for word recognition
const MIN_CONFIDENCE_THRESHOLD = 60;
class OCRClickPlugin extends base_plugin_1.BasePlugin {
shouldAvoidProxy(method, route, body) {
// Handle plugin routes
if (SOURCE_URL_REGEX.test(route) || CHECK_TEXT_URL_REGEX.test(route) || ASK_LLM_URL_REGEX.test(route) || ASK_LLM_SIMPLE_URL_REGEX.test(route)) {
return true;
}
// Handle execute commands for mobile: textclick, mobile: checktext, mobile: askllm, and mobile: askllm-simple
if (EXECUTE_URL_REGEX.test(route) && (body === null || body === void 0 ? void 0 : body.script)) {
const script = body.script;
if (script === 'mobile: textclick' || script === 'mobile: checktext' || script === 'mobile: askllm' || script === 'mobile: askllm-simple') {
return true;
}
}
return false;
}
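// Example client call that handle() below intercepts (shouldAvoidProxy above
// keeps these routes and scripts from being proxied to the driver). Illustrative:
// the execute-script form works from any WebDriver client; this snippet assumes
// a WebdriverIO-style driver.execute:
//
//   const result = await driver.execute('mobile: textclick', { text: 'Sign in', index: 0 });
//   // => { success: true, message: 'Clicked on text "Sign in" at index 0', ... }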
handle(next, driver, cmdName, ...args) {
return __awaiter(this, void 0, void 0, function* () {
// Dispatch execute calls for the plugin's mobile: scripts
if (cmdName === 'execute') {
const [script, scriptArgs] = args;
if (script === 'mobile: textclick') {
const params = (scriptArgs === null || scriptArgs === void 0 ? void 0 : scriptArgs[0]) || {};
const { text, index = 0 } = params;
if (!text) {
throw new Error('Text parameter is required for mobile: textclick');
}
return yield this.findAndClickText(next, driver, text, index);
}
if (script === 'mobile: checktext') {
const params = (scriptArgs === null || scriptArgs === void 0 ? void 0 : scriptArgs[0]) || {};
const { text } = params;
if (!text) {
throw new Error('Text parameter is required for mobile: checktext');
}
return yield this.checkTextPresent(next, driver, text);
}
if (script === 'mobile: askllm') {
const params = (scriptArgs === null || scriptArgs === void 0 ? void 0 : scriptArgs[0]) || {};
const { instruction } = params;
if (!instruction) {
throw new Error('Instruction parameter is required for mobile: askllm');
}
return yield this.askAI(next, driver, instruction);
}
if (script === 'mobile: askllm-simple') {
const params = (scriptArgs === null || scriptArgs === void 0 ? void 0 : scriptArgs[0]) || {};
const { instruction } = params;
if (!instruction) {
throw new Error('Instruction parameter is required for mobile: askllm-simple');
}
return yield this.askAISimple(next, driver, instruction);
}
}
// Handle plugin-specific commands
if (cmdName === 'findAndClickText') {
const [text, index = 0] = args;
return yield this.findAndClickText(next, driver, text, index);
}
if (cmdName === 'checkTextPresent') {
const [text] = args;
return yield this.checkTextPresent(next, driver, text);
}
if (cmdName === 'askAI') {
const [instruction] = args;
return yield this.askAI(next, driver, instruction);
}
if (cmdName === 'askAISimple') {
const [instruction] = args;
return yield this.askAISimple(next, driver, instruction);
}
return yield next();
});
}
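// findAndClickText: OCR the current screen with Tesseract, keep words whose
// confidence clears MIN_CONFIDENCE_THRESHOLD, pick the index-th word containing
// the target text (case-insensitive substring match), and tap the center of its
// bounding box via W3C pointer actions.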
findAndClickText(next_1, driver_1, text_1) {
return __awaiter(this, arguments, void 0, function* (next, driver, text, index = 0) {
try {
if (!driver.getScreenshot) {
throw new Error('Screenshot functionality not available');
}
// Step 1: Capture screenshot
const screenshotBase64 = yield driver.getScreenshot();
console.log('Enhancing screenshot for better OCR results...');
// Step 2: Enhance the screenshot (if Sharp is available)
const enhancedBase64Image = yield this.enhanceScreenshot(screenshotBase64);
// Step 3: Process the enhanced screenshot with OCR
console.log('Processing enhanced screenshot with OCR...');
const result = yield tesseract_js_1.default.recognize(Buffer.from(enhancedBase64Image, 'base64'), 'eng', Object.assign(Object.assign({}, TESSERACT_CONFIG), { logger: (m) => {
if (m.status === 'recognizing text') {
console.log(`OCR Progress: ${Math.round(m.progress * 100)}%`);
}
} }));
// Filter words by confidence threshold
const words = result.data.words.filter(word => word.confidence >= MIN_CONFIDENCE_THRESHOLD);
console.log('OCR result:', words.map(w => ({ text: w.text, confidence: w.confidence })));
// Step 4: Find all matches for the given text
const matchingWords = words.filter(word => {
const normalizedWord = word.text.toLowerCase().trim();
const normalizedSearchText = text.toLowerCase().trim();
return normalizedWord.includes(normalizedSearchText);
});
if (matchingWords.length === 0) {
throw new Error(`Text "${text}" not found in the screenshot`);
}
if (index < 0 || index >= matchingWords.length) {
throw new Error(`Invalid index "${index}". Found ${matchingWords.length} matches for text "${text}".`);
}
// Step 5: Get the desired match based on index
const targetWord = matchingWords[index];
const { x0, y0, x1, y1 } = targetWord.bbox;
const centerX = (x0 + x1) / 2;
const centerY = (y0 + y1) / 2;
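// NOTE: these coordinates are in screenshot pixel space; on devices where the
// screenshot resolution differs from the logical tap grid (common at high DPI),
// a scale factor would be needed — this code assumes a 1:1 mapping.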
console.log(`Text "${targetWord.text}" found at coordinates: (${centerX}, ${centerY}) with confidence: ${targetWord.confidence}%`);
if (!driver.performActions) {
throw new Error('Pointer actions not available');
}
// Step 6: Perform the click action
yield driver.performActions([
{
type: 'pointer',
id: 'finger1',
actions: [
{ type: 'pointerMove', duration: 0, x: centerX, y: centerY },
{ type: 'pointerDown', button: 0 },
{ type: 'pause', duration: 100 },
{ type: 'pointerUp', button: 0 },
],
},
]);
console.log(`Successfully clicked on text "${text}" at index ${index}`);
return {
success: true,
message: `Clicked on text "${text}" at index ${index}`,
totalMatches: matchingWords.length,
confidence: targetWord.confidence,
imageEnhanced: sharp !== null,
};
}
catch (err) {
console.error('Error in findAndClickText:', err);
throw err;
}
});
}
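// checkTextPresent: same OCR pipeline as findAndClickText, but reports whether
// the text is visible instead of tapping it; errors are folded into the result
// object rather than thrown.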
checkTextPresent(next, driver, text) {
return __awaiter(this, void 0, void 0, function* () {
try {
if (!driver.getScreenshot) {
throw new Error('Screenshot functionality not available');
}
// Step 1: Capture screenshot
const screenshotBase64 = yield driver.getScreenshot();
console.log('Enhancing screenshot for OCR text detection...');
// Step 2: Enhance the screenshot (if Sharp is available)
const enhancedBase64Image = yield this.enhanceScreenshot(screenshotBase64);
// Step 3: Process the enhanced screenshot with OCR
console.log('Processing enhanced screenshot with OCR...');
const result = yield tesseract_js_1.default.recognize(Buffer.from(enhancedBase64Image, 'base64'), 'eng', Object.assign(Object.assign({}, TESSERACT_CONFIG), { logger: (m) => {
if (m.status === 'recognizing text') {
console.log(`OCR Progress: ${Math.round(m.progress * 100)}%`);
}
} }));
// Filter words by confidence threshold
const words = result.data.words.filter(word => word.confidence >= MIN_CONFIDENCE_THRESHOLD);
console.log('OCR result:', words.map(w => ({ text: w.text, confidence: w.confidence })));
// Step 4: Find all matches for the given text
const matchingWords = words.filter(word => {
const normalizedWord = word.text.toLowerCase().trim();
const normalizedSearchText = text.toLowerCase().trim();
return normalizedWord.includes(normalizedSearchText);
});
const isPresent = matchingWords.length > 0;
// Prepare detailed match information
const matches = matchingWords.map(word => ({
text: word.text,
confidence: word.confidence,
coordinates: {
x: (word.bbox.x0 + word.bbox.x1) / 2,
y: (word.bbox.y0 + word.bbox.y1) / 2
},
bbox: word.bbox
}));
console.log(`Text "${text}" detection result: ${isPresent ? 'FOUND' : 'NOT FOUND'} (${matchingWords.length} matches)`);
return {
success: true,
isPresent: isPresent,
totalMatches: matchingWords.length,
searchText: text,
matches: matches,
imageEnhanced: sharp !== null,
message: isPresent
? `Text "${text}" found with ${matchingWords.length} match(es)`
: `Text "${text}" not found in the screenshot`
};
}
catch (err) {
console.error('Error in checkTextPresent:', err);
const errorMessage = err instanceof Error ? err.message : String(err);
return {
success: false,
isPresent: false,
totalMatches: 0,
searchText: text,
matches: [],
imageEnhanced: sharp !== null,
message: `Error checking text presence: ${errorMessage}`,
error: errorMessage
};
}
});
}
// Helper method to enhance the screenshot (with fallback if Sharp not available)
enhanceScreenshot(base64Image) {
return __awaiter(this, void 0, void 0, function* () {
if (!sharp) {
console.log('Sharp not available - using original image without enhancement');
return base64Image;
}
try {
const imageBuffer = Buffer.from(base64Image, 'base64');
const enhancedBuffer = yield sharp(imageBuffer)
.grayscale() // Convert to grayscale
.normalize() // Normalize contrast
.sharpen({
sigma: 1.5,
m1: 1.5,
m2: 2.0,
x1: 2.0,
y2: 10,
y3: 20,
})
.gamma(1.2) // Slightly increase gamma for better text contrast
.median(1) // Remove noise
.threshold(128) // Binary threshold for clearer text
.toBuffer();
return enhancedBuffer.toString('base64');
}
catch (err) {
console.error('Error enhancing screenshot with Sharp, using original:', err);
return base64Image; // Fallback to original image
}
});
}
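// Illustrative askllm usage (assumes @google-cloud/vertexai is installed and
// the GOOGLE_* environment variables checked below are set on the server):
//
//   const result = await driver.execute('mobile: askllm', {
//     instruction: 'Is there a red error banner on this screen?',
//   });
//   // result.response carries the raw Vertex AI candidate payload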
// askAI: send an instruction plus a fresh screenshot to a Google Vertex AI model
askAI(next, driver, instruction) {
return __awaiter(this, void 0, void 0, function* () {
try {
console.log(`AI Instruction received: ${instruction}`);
const takeANewScreenShot = true;
const sessionId = driver.sessionId || 'unknown-session';
// Capture a screenshot and convert it to base64
const screenshotPath = yield this.getScreenshotPath(driver, sessionId, takeANewScreenShot);
const base64Screenshot = fs.readFileSync(screenshotPath, 'base64');
// Send the instruction and screenshot to the Vertex AI model
const response = yield this.askGoogleVisionAI(instruction, base64Screenshot);
return {
success: true,
instruction: instruction,
response: response,
message: 'AI analysis completed successfully'
};
}
catch (err) {
console.error('Error in askAI:', err);
const errorMessage = err instanceof Error ? err.message : String(err);
return {
success: false,
instruction: instruction,
response: null,
message: `Error processing AI request: ${errorMessage}`,
error: errorMessage
};
}
});
}
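// Illustrative askllm-simple usage: because the response text comes back with
// any ```json fences already stripped, a JSON-producing instruction can be
// parsed directly — assuming the model honors the requested format:
//
//   const raw = await driver.execute('mobile: askllm-simple', {
//     instruction: 'List all visible button labels as a JSON array of strings.',
//   });
//   const labels = JSON.parse(raw);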
// Simplified askAI method that returns only the AI response text
askAISimple(next, driver, instruction) {
return __awaiter(this, void 0, void 0, function* () {
var _a, _b, _c, _d, _e;
try {
console.log(`AI Simple Instruction received: ${instruction}`);
const takeANewScreenShot = true;
const sessionId = driver.sessionId || 'unknown-session';
// Capture a screenshot and convert it to base64
const screenshotPath = yield this.getScreenshotPath(driver, sessionId, takeANewScreenShot);
const base64Screenshot = fs.readFileSync(screenshotPath, 'base64');
// Send the instruction and screenshot to the Vertex AI model
const response = yield this.askGoogleVisionAI(instruction, base64Screenshot);
// Extract the AI response text
let aiResponseText = ((_e = (_d = (_c = (_b = (_a = response === null || response === void 0 ? void 0 : response.candidates) === null || _a === void 0 ? void 0 : _a[0]) === null || _b === void 0 ? void 0 : _b.content) === null || _c === void 0 ? void 0 : _c.parts) === null || _d === void 0 ? void 0 : _d[0]) === null || _e === void 0 ? void 0 : _e.text) || '';
// Clean the response by removing ```json markers and extra whitespace
aiResponseText = aiResponseText.replace(/^```json\s*/, '').replace(/\s*```$/, '').trim();
return aiResponseText;
}
catch (err) {
console.error('Error in askAISimple:', err);
const errorMessage = err instanceof Error ? err.message : String(err);
throw new Error(`AI request failed: ${errorMessage}`);
}
});
}
// Helper method to get screenshot path
getScreenshotPath(driver_1, sessionId_1) {
return __awaiter(this, arguments, void 0, function* (driver, sessionId, takeNewScreenshot = true) {
if (!driver.getScreenshot) {
throw new Error('Screenshot functionality not available');
}
// Create screenshots directory if it doesn't exist
const screenshotsDir = path.join(process.cwd(), 'screenshots');
if (!fs.existsSync(screenshotsDir)) {
fs.mkdirSync(screenshotsDir, { recursive: true });
}
const screenshotPath = path.join(screenshotsDir, `${sessionId}_${Date.now()}.png`);
if (takeNewScreenshot) {
const screenshotBase64 = yield driver.getScreenshot();
fs.writeFileSync(screenshotPath, screenshotBase64, 'base64');
}
return screenshotPath;
});
}
// Vertex AI request wrapper (despite the method name, this calls Vertex AI generative models, not the Vision API)
askGoogleVisionAI(instruction, encodedImg) {
return __awaiter(this, void 0, void 0, function* () {
console.log(`Google Vision AI instruction received`);
let response;
try {
// Retrieve Google Cloud environment variables
const projectId = process.env.GOOGLE_PROJECT_ID;
const location = process.env.GOOGLE_LOCATION;
const model = process.env.GOOGLE_MODEL;
// Validate environment variables
if (!projectId || !location || !model) {
throw new Error('Google Cloud environment variables are not set. Required: GOOGLE_PROJECT_ID, GOOGLE_LOCATION, GOOGLE_MODEL');
}
// Call Vertex AI to process the instruction and image
response = yield this.createNonStreamingMultipartContent(projectId, location, model, encodedImg, instruction);
console.log("AI Response:", response);
}
catch (error) {
console.error("Error processing the image or query:", error);
throw error;
}
return response;
});
}
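// Required server-side environment for the AI commands (values below are
// placeholders, not defaults):
//
//   export GOOGLE_PROJECT_ID=my-gcp-project
//   export GOOGLE_LOCATION=us-central1
//   export GOOGLE_MODEL=gemini-1.5-flash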
// Google Vertex AI integration
createNonStreamingMultipartContent(projectId, location, model, encodedImg, instruction) {
return __awaiter(this, void 0, void 0, function* () {
if (!VertexAI) {
throw new Error('Google Cloud Vertex AI SDK is not available. Please install it with: npm install @google-cloud/vertexai');
}
try {
// Initialize Vertex AI
const vertexAI = new VertexAI({
project: projectId,
location: location,
});
// Get the generative model
const generativeModel = vertexAI.getGenerativeModel({
model: model,
});
// Prepare the content with proper structure
const contents = [
{
role: 'user',
parts: [
{
text: instruction,
},
{
inlineData: {
data: encodedImg,
mimeType: 'image/png',
},
}
]
}
];
// Generate content with proper request structure
const request = {
contents: contents,
};
const response = yield generativeModel.generateContent(request);
console.log('Vertex AI response received');
return response.response;
}
catch (error) {
console.error('Error calling Vertex AI:', error);
throw new Error(`Vertex AI API error: ${error instanceof Error ? error.message : String(error)}`);
}
});
}
}
exports.OCRClickPlugin = OCRClickPlugin;
// Define a new method map for Appium commands
OCRClickPlugin.newMethodMap = {
'/session/:sessionId/appium/plugin/textclick': {
POST: {
command: 'findAndClickText',
payloadParams: {
required: ['text'],
optional: ['index'],
},
},
},
'/session/:sessionId/appium/plugin/checktext': {
POST: {
command: 'checkTextPresent',
payloadParams: {
required: ['text'],
optional: [],
},
},
},
'/session/:sessionId/appium/plugin/askllm': {
POST: {
command: 'askAI',
payloadParams: {
required: ['instruction'],
optional: [],
},
},
},
'/session/:sessionId/appium/plugin/askllm-simple': {
POST: {
command: 'askAISimple',
payloadParams: {
required: ['instruction'],
optional: [],
},
},
},
};
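// The routes above are also callable directly as WebDriver endpoints
// (illustrative request bodies; the session ID comes from the active session):
//
//   POST /session/:sessionId/appium/plugin/textclick      { "text": "Login", "index": 0 }
//   POST /session/:sessionId/appium/plugin/checktext      { "text": "Welcome" }
//   POST /session/:sessionId/appium/plugin/askllm         { "instruction": "Describe this screen" }
//   POST /session/:sessionId/appium/plugin/askllm-simple  { "instruction": "Describe this screen" }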