/**
 * ocr-click-plugin
 *
 * An Appium plugin that uses OCR (Optical Character Recognition) to find and
 * click text elements on mobile device screens, with AI-powered screen analysis.
 */
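//
// Illustrative setup (assumptions: the package is published to npm under this
// name and registered as an Appium 2.x plugin; exact install names may differ):
//
//   appium plugin install --source=npm ocr-click-plugin
//   appium --use-plugins=ocr-click-plugin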
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.OCRClickPlugin = void 0;
const base_plugin_1 = require("@appium/base-plugin");
const tesseract_js_1 = __importDefault(require("tesseract.js"));
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
// Try to import Google Vertex AI, but handle gracefully if not available
let VertexAI = null;
try {
const { VertexAI: VertexAIClass } = require('@google-cloud/vertexai');
VertexAI = VertexAIClass;
console.log('✅ Google Cloud Vertex AI SDK loaded successfully');
}
catch (error) {
console.warn('⚠️ Google Cloud Vertex AI SDK not available. AI functionality will be disabled. Install with: npm install @google-cloud/vertexai');
}
// Try to import Sharp, but handle gracefully if not available
let sharp = null;
try {
sharp = require('sharp');
console.log('✅ Sharp image processing library loaded successfully');
}
catch (error) {
console.warn('⚠️ Sharp not available. Image enhancement will be disabled. Install with: SHARP_IGNORE_GLOBAL_LIBVIPS=1 npm install --include=optional sharp');
}
const SOURCE_URL_REGEX = new RegExp('/session/[^/]+/appium/plugin/textclick');
const CHECK_TEXT_URL_REGEX = new RegExp('/session/[^/]+/appium/plugin/checktext');
const ASK_LLM_URL_REGEX = new RegExp('/session/[^/]+/appium/plugin/askllm');
const ASK_LLM_SIMPLE_URL_REGEX = new RegExp('/session/[^/]+/appium/plugin/askllm-simple');
const EXECUTE_URL_REGEX = new RegExp('/session/[^/]+/execute');
// Tesseract configuration for better accuracy
const TESSERACT_CONFIG = {
lang: 'eng',
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?-_@#$%^&*()', // Limit recognized characters
tessedit_pageseg_mode: '6', // Assume uniform text block
tessedit_do_invert: '0',
preserve_interword_spaces: '1',
tessjs_create_pdf: '0',
tessjs_create_hocr: '0',
tessjs_create_tsv: '0',
};
// Minimum confidence threshold for word recognition
const MIN_CONFIDENCE_THRESHOLD = 60;
class OCRClickPlugin extends base_plugin_1.BasePlugin {
shouldAvoidProxy(method, route, body) {
// Handle plugin routes
if (SOURCE_URL_REGEX.test(route) || CHECK_TEXT_URL_REGEX.test(route) || ASK_LLM_URL_REGEX.test(route) || ASK_LLM_SIMPLE_URL_REGEX.test(route)) {
return true;
}
// Handle execute commands for mobile: textclick, mobile: checktext, mobile: askllm, and mobile: askllm-simple
if (EXECUTE_URL_REGEX.test(route) && (body === null || body === void 0 ? void 0 : body.script)) {
const script = body.script;
if (script === 'mobile: textclick' || script === 'mobile: checktext' || script === 'mobile: askllm' || script === 'mobile: askllm-simple') {
return true;
}
}
return false;
}
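// Example client call that handle() below intercepts (shouldAvoidProxy above
// keeps these routes and scripts from being proxied to the driver). Illustrative:
// the execute-script form works from any WebDriver client; this snippet assumes
// a WebdriverIO-style driver.execute:
//
//   const result = await driver.execute('mobile: textclick', { text: 'Sign in', index: 0 });
//   // => { success: true, message: 'Clicked on text "Sign in" at index 0', ... }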
handle(next, driver, cmdName, ...args) {
return __awaiter(this, void 0, void 0, function* () {
// Dispatch execute calls for the plugin's mobile: scripts
if (cmdName === 'execute') {
const [script, scriptArgs] = args;
if (script === 'mobile: textclick') {
const params = (scriptArgs === null || scriptArgs === void 0 ? void 0 : scriptArgs[0]) || {};
const { text, index = 0 } = params;
if (!text) {
throw new Error('Text parameter is required for mobile: textclick');
}
return yield this.findAndClickText(next, driver, text, index);
}
if (script === 'mobile: checktext') {
const params = (scriptArgs === null || scriptArgs === void 0 ? void 0 : scriptArgs[0]) || {};
const { text } = params;
if (!text) {
throw new Error('Text parameter is required for mobile: checktext');
}
return yield this.checkTextPresent(next, driver, text);
}
if (script === 'mobile: askllm') {
const params = (scriptArgs === null || scriptArgs === void 0 ? void 0 : scriptArgs[0]) || {};
const { instruction } = params;
if (!instruction) {
throw new Error('Instruction parameter is required for mobile: askllm');
}
return yield this.askAI(next, driver, instruction);
}
if (script === 'mobile: askllm-simple') {
const params = (scriptArgs === null || scriptArgs === void 0 ? void 0 : scriptArgs[0]) || {};
const { instruction } = params;
if (!instruction) {
throw new Error('Instruction parameter is required for mobile: askllm-simple');
}
return yield this.askAISimple(next, driver, instruction);
}
}
// Handle plugin-specific commands
if (cmdName === 'findAndClickText') {
const [text, index = 0] = args;
return yield this.findAndClickText(next, driver, text, index);
}
if (cmdName === 'checkTextPresent') {
const [text] = args;
return yield this.checkTextPresent(next, driver, text);
}
if (cmdName === 'askAI') {
const [instruction] = args;
return yield this.askAI(next, driver, instruction);
}
if (cmdName === 'askAISimple') {
const [instruction] = args;
return yield this.askAISimple(next, driver, instruction);
}
return yield next();
});
}
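// findAndClickText: OCR the current screen with Tesseract, keep words whose
// confidence clears MIN_CONFIDENCE_THRESHOLD, pick the index-th word containing
// the target text (case-insensitive substring match), and tap the center of its
// bounding box via W3C pointer actions.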
findAndClickText(next_1, driver_1, text_1) {
return __awaiter(this, arguments, void 0, function* (next, driver, text, index = 0) {
try {
if (!driver.getScreenshot) {
throw new Error('Screenshot functionality not available');
}
// Step 1: Capture screenshot
const screenshotBase64 = yield driver.getScreenshot();
console.log('Enhancing screenshot for better OCR results...');
// Step 2: Enhance the screenshot (if Sharp is available)
const enhancedBase64Image = yield this.enhanceScreenshot(screenshotBase64);
// Step 3: Process the enhanced screenshot with OCR
console.log('Processing enhanced screenshot with OCR...');
const result = yield tesseract_js_1.default.recognize(Buffer.from(enhancedBase64Image, 'base64'), 'eng', Object.assign(Object.assign({}, TESSERACT_CONFIG), { logger: (m) => {
if (m.status === 'recognizing text') {
console.log(`OCR Progress: ${Math.round(m.progress * 100)}%`);
}
} }));
// Filter words by confidence threshold
const words = result.data.words.filter(word => word.confidence >= MIN_CONFIDENCE_THRESHOLD);
console.log('OCR result:', words.map(w => ({ text: w.text, confidence: w.confidence })));
// Step 4: Find all matches for the given text
const matchingWords = words.filter(word => {
const normalizedWord = word.text.toLowerCase().trim();
const normalizedSearchText = text.toLowerCase().trim();
return normalizedWord.includes(normalizedSearchText);
});
if (matchingWords.length === 0) {
throw new Error(`Text "${text}" not found in the screenshot`);
}
if (index < 0 || index >= matchingWords.length) {
throw new Error(`Invalid index "${index}". Found ${matchingWords.length} matches for text "${text}".`);
}
// Step 5: Get the desired match based on index
const targetWord = matchingWords[index];
const { x0, y0, x1, y1 } = targetWord.bbox;
const centerX = (x0 + x1) / 2;
const centerY = (y0 + y1) / 2;
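// NOTE: these coordinates are in screenshot pixel space; on devices where the
// screenshot resolution differs from the logical tap grid (common at high DPI),
// a scale factor would be needed — this code assumes a 1:1 mapping.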
console.log(`Text "${targetWord.text}" found at coordinates: (${centerX}, ${centerY}) with confidence: ${targetWord.confidence}%`);
if (!driver.performActions) {
throw new Error('Pointer actions not available');
}
// Step 6: Perform the click action
yield driver.performActions([
{
type: 'pointer',
id: 'finger1',
actions: [
{ type: 'pointerMove', duration: 0, x: centerX, y: centerY },
{ type: 'pointerDown', button: 0 },
{ type: 'pause', duration: 100 },
{ type: 'pointerUp', button: 0 },
],
},
]);
console.log(`Successfully clicked on text "${text}" at index ${index}`);
return {
success: true,
message: `Clicked on text "${text}" at index ${index}`,
totalMatches: matchingWords.length,
confidence: targetWord.confidence,
imageEnhanced: sharp !== null,
};
}
catch (err) {
console.error('Error in findAndClickText:', err);
throw err;
}
});
}
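// checkTextPresent: same OCR pipeline as findAndClickText, but reports whether
// the text is visible instead of tapping it; errors are folded into the result
// object rather than thrown.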
checkTextPresent(next, driver, text) {
return __awaiter(this, void 0, void 0, function* () {
try {
if (!driver.getScreenshot) {
throw new Error('Screenshot functionality not available');
}
// Step 1: Capture screenshot
const screenshotBase64 = yield driver.getScreenshot();
console.log('Enhancing screenshot for OCR text detection...');
// Step 2: Enhance the screenshot (if Sharp is available)
const enhancedBase64Image = yield this.enhanceScreenshot(screenshotBase64);
// Step 3: Process the enhanced screenshot with OCR
console.log('Processing enhanced screenshot with OCR...');
const result = yield tesseract_js_1.default.recognize(Buffer.from(enhancedBase64Image, 'base64'), 'eng', Object.assign(Object.assign({}, TESSERACT_CONFIG), { logger: (m) => {
if (m.status === 'recognizing text') {
console.log(`OCR Progress: ${Math.round(m.progress * 100)}%`);
}
} }));
// Filter words by confidence threshold
const words = result.data.words.filter(word => word.confidence >= MIN_CONFIDENCE_THRESHOLD);
console.log('OCR result:', words.map(w => ({ text: w.text, confidence: w.confidence })));
// Step 4: Find all matches for the given text
const matchingWords = words.filter(word => {
const normalizedWord = word.text.toLowerCase().trim();
const normalizedSearchText = text.toLowerCase().trim();
return normalizedWord.includes(normalizedSearchText);
});
const isPresent = matchingWords.length > 0;
// Prepare detailed match information
const matches = matchingWords.map(word => ({
text: word.text,
confidence: word.confidence,
coordinates: {
x: (word.bbox.x0 + word.bbox.x1) / 2,
y: (word.bbox.y0 + word.bbox.y1) / 2
},
bbox: word.bbox
}));
console.log(`Text "${text}" detection result: ${isPresent ? 'FOUND' : 'NOT FOUND'} (${matchingWords.length} matches)`);
return {
success: true,
isPresent: isPresent,
totalMatches: matchingWords.length,
searchText: text,
matches: matches,
imageEnhanced: sharp !== null,
message: isPresent
? `Text "${text}" found with ${matchingWords.length} match(es)`
: `Text "${text}" not found in the screenshot`
};
}
catch (err) {
console.error('Error in checkTextPresent:', err);
const errorMessage = err instanceof Error ? err.message : String(err);
return {
success: false,
isPresent: false,
totalMatches: 0,
searchText: text,
matches: [],
imageEnhanced: sharp !== null,
message: `Error checking text presence: ${errorMessage}`,
error: errorMessage
};
}
});
}
// Helper method to enhance the screenshot (with fallback if Sharp not available)
enhanceScreenshot(base64Image) {
return __awaiter(this, void 0, void 0, function* () {
if (!sharp) {
console.log('Sharp not available - using original image without enhancement');
return base64Image;
}
try {
const imageBuffer = Buffer.from(base64Image, 'base64');
const enhancedBuffer = yield sharp(imageBuffer)
.grayscale() // Convert to grayscale
.normalize() // Normalize contrast
.sharpen({
sigma: 1.5,
m1: 1.5,
m2: 2.0,
x1: 2.0,
y2: 10,
y3: 20,
})
.gamma(1.2) // Slightly increase gamma for better text contrast
.median(1) // Remove noise
.threshold(128) // Binary threshold for clearer text
.toBuffer();
return enhancedBuffer.toString('base64');
}
catch (err) {
console.error('Error enhancing screenshot with Sharp, using original:', err);
return base64Image; // Fallback to original image
}
});
}
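// Illustrative askllm usage (assumes @google-cloud/vertexai is installed and
// the GOOGLE_* environment variables checked below are set on the server):
//
//   const result = await driver.execute('mobile: askllm', {
//     instruction: 'Is there a red error banner on this screen?',
//   });
//   // result.response carries the raw Vertex AI candidate payload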
// askAI: send an instruction plus a fresh screenshot to a Google Vertex AI model
askAI(next, driver, instruction) {
return __awaiter(this, void 0, void 0, function* () {
try {
console.log(`AI Instruction received: ${instruction}`);
const takeANewScreenShot = true;
const sessionId = driver.sessionId || 'unknown-session';
// Capture a screenshot and convert it to base64
const screenshotPath = yield this.getScreenshotPath(driver, sessionId, takeANewScreenShot);
const base64Screenshot = fs.readFileSync(screenshotPath, 'base64');
// Send the instruction and screenshot to the Vertex AI model
const response = yield this.askGoogleVisionAI(instruction, base64Screenshot);
return {
success: true,
instruction: instruction,
response: response,
message: 'AI analysis completed successfully'
};
}
catch (err) {
console.error('Error in askAI:', err);
const errorMessage = err instanceof Error ? err.message : String(err);
return {
success: false,
instruction: instruction,
response: null,
message: `Error processing AI request: ${errorMessage}`,
error: errorMessage
};
}
});
}
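// Illustrative askllm-simple usage: because the response text comes back with
// any ```json fences already stripped, a JSON-producing instruction can be
// parsed directly — assuming the model honors the requested format:
//
//   const raw = await driver.execute('mobile: askllm-simple', {
//     instruction: 'List all visible button labels as a JSON array of strings.',
//   });
//   const labels = JSON.parse(raw);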
// Simplified askAI method that returns only the AI response text
askAISimple(next, driver, instruction) {
return __awaiter(this, void 0, void 0, function* () {
var _a, _b, _c, _d, _e;
try {
console.log(`AI Simple Instruction received: ${instruction}`);
const takeANewScreenShot = true;
const sessionId = driver.sessionId || 'unknown-session';
// Capture a screenshot and convert it to base64
const screenshotPath = yield this.getScreenshotPath(driver, sessionId, takeANewScreenShot);
const base64Screenshot = fs.readFileSync(screenshotPath, 'base64');
// Send the instruction and screenshot to the Vertex AI model
const response = yield this.askGoogleVisionAI(instruction, base64Screenshot);
// Extract the AI response text
let aiResponseText = ((_e = (_d = (_c = (_b = (_a = response === null || response === void 0 ? void 0 : response.candidates) === null || _a === void 0 ? void 0 : _a[0]) === null || _b === void 0 ? void 0 : _b.content) === null || _c === void 0 ? void 0 : _c.parts) === null || _d === void 0 ? void 0 : _d[0]) === null || _e === void 0 ? void 0 : _e.text) || '';
// Clean the response by removing ```json markers and extra whitespace
aiResponseText = aiResponseText.replace(/^```json\s*/, '').replace(/\s*```$/, '').trim();
return aiResponseText;
}
catch (err) {
console.error('Error in askAISimple:', err);
const errorMessage = err instanceof Error ? err.message : String(err);
throw new Error(`AI request failed: ${errorMessage}`);
}
});
}
// Helper method to get screenshot path
getScreenshotPath(driver_1, sessionId_1) {
return __awaiter(this, arguments, void 0, function* (driver, sessionId, takeNewScreenshot = true) {
if (!driver.getScreenshot) {
throw new Error('Screenshot functionality not available');
}
// Create screenshots directory if it doesn't exist
const screenshotsDir = path.join(process.cwd(), 'screenshots');
if (!fs.existsSync(screenshotsDir)) {
fs.mkdirSync(screenshotsDir, { recursive: true });
}
const screenshotPath = path.join(screenshotsDir, `${sessionId}_${Date.now()}.png`);
if (takeNewScreenshot) {
const screenshotBase64 = yield driver.getScreenshot();
fs.writeFileSync(screenshotPath, screenshotBase64, 'base64');
}
return screenshotPath;
});
}
// Vertex AI request wrapper (despite the method name, this calls Vertex AI generative models, not the Vision API)
askGoogleVisionAI(instruction, encodedImg) {
return __awaiter(this, void 0, void 0, function* () {
console.log(`Google Vision AI instruction received`);
let response;
try {
// Retrieve Google Cloud environment variables
const projectId = process.env.GOOGLE_PROJECT_ID;
const location = process.env.GOOGLE_LOCATION;
const model = process.env.GOOGLE_MODEL;
// Validate environment variables
if (!projectId || !location || !model) {
throw new Error('Google Cloud environment variables are not set. Required: GOOGLE_PROJECT_ID, GOOGLE_LOCATION, GOOGLE_MODEL');
}
// Call Vertex AI to process the instruction and image
response = yield this.createNonStreamingMultipartContent(projectId, location, model, encodedImg, instruction);
console.log("AI Response:", response);
}
catch (error) {
console.error("Error processing the image or query:", error);
throw error;
}
return response;
});
}
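// Required server-side environment for the AI commands (values below are
// placeholders, not defaults):
//
//   export GOOGLE_PROJECT_ID=my-gcp-project
//   export GOOGLE_LOCATION=us-central1
//   export GOOGLE_MODEL=gemini-1.5-flash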
// Google Vertex AI integration
createNonStreamingMultipartContent(projectId, location, model, encodedImg, instruction) {
return __awaiter(this, void 0, void 0, function* () {
if (!VertexAI) {
throw new Error('Google Cloud Vertex AI SDK is not available. Please install it with: npm install @google-cloud/vertexai');
}
try {
// Initialize Vertex AI
const vertexAI = new VertexAI({
project: projectId,
location: location,
});
// Get the generative model
const generativeModel = vertexAI.getGenerativeModel({
model: model,
});
// Prepare the content with proper structure
const contents = [
{
role: 'user',
parts: [
{
text: instruction,
},
{
inlineData: {
data: encodedImg,
mimeType: 'image/png',
},
}
]
}
];
// Generate content with proper request structure
const request = {
contents: contents,
};
const response = yield generativeModel.generateContent(request);
console.log('Vertex AI response received');
return response.response;
}
catch (error) {
console.error('Error calling Vertex AI:', error);
throw new Error(`Vertex AI API error: ${error instanceof Error ? error.message : String(error)}`);
}
});
}
}
exports.OCRClickPlugin = OCRClickPlugin;
// Define a new method map for Appium commands
OCRClickPlugin.newMethodMap = {
'/session/:sessionId/appium/plugin/textclick': {
POST: {
command: 'findAndClickText',
payloadParams: {
required: ['text'],
optional: ['index'],
},
},
},
'/session/:sessionId/appium/plugin/checktext': {
POST: {
command: 'checkTextPresent',
payloadParams: {
required: ['text'],
optional: [],
},
},
},
'/session/:sessionId/appium/plugin/askllm': {
POST: {
command: 'askAI',
payloadParams: {
required: ['instruction'],
optional: [],
},
},
},
'/session/:sessionId/appium/plugin/askllm-simple': {
POST: {
command: 'askAISimple',
payloadParams: {
required: ['instruction'],
optional: [],
},
},
},
};
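// The routes above are also callable directly as WebDriver endpoints
// (illustrative request bodies; the session ID comes from the active session):
//
//   POST /session/:sessionId/appium/plugin/textclick      { "text": "Login", "index": 0 }
//   POST /session/:sessionId/appium/plugin/checktext      { "text": "Welcome" }
//   POST /session/:sessionId/appium/plugin/askllm         { "instruction": "Describe this screen" }
//   POST /session/:sessionId/appium/plugin/askllm-simple  { "instruction": "Describe this screen" }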