UNPKG

askui

Version:

Reliable, automated end-to-end-testing that depends on what is shown on your screen instead of the technology you are running on

653 lines (652 loc) 26.6 kB
import { Exec, Executable, FluentFilters, ApiCommands, PC_AND_MODIFIER_KEY, CommandExecutorContext } from './dsl';
import { UiControllerClientConnectionState } from './ui-controller-client-connection-state';
import { Annotation } from '../core/annotation/annotation';
import { AnnotationRequest } from '../core/model/annotation-result/annotation-interface';
import { DetectedElement } from '../core/model/annotation-result/detected-element';
import { ClientArgs } from './ui-controller-client-interface';
import { ModelCompositionBranch } from './model-composition-branch';
import { AskUIAgent, AgentHistory, ActOptions } from '../core/models/anthropic';
/**
 * Relation types accepted by the convenience methods of `UiControlClient`
 * (e.g. `clickButton`, `clickText`, `expectAllExist`).
 */
export type RelationsForConvenienceMethods = 'nearestTo' | 'leftOf' | 'above' | 'rightOf' | 'below' | 'contains';
/**
 * Whether a text is matched using similarity, exact match or a regular expression.
 */
export type TextMatchingOption = 'similar' | 'exact' | 'regex';
/**
 * Element types that can be queried with `expectAllExist`.
 */
export type ElementExistsQueryType = 'otherElement' | 'switch' | 'element' | 'container' | 'checkbox' | 'button' | 'table' | 'text' | 'icon' | 'image' | 'textfield';
/**
 * Text condition of an `ElementExistsQuery`: the text to match for and,
 * optionally, the matching strategy.
 */
export interface ElementExistsQueryText {
    value: string;
    matching?: TextMatchingOption;
}
/**
 * Relation condition of an `ElementExistsQuery`: the type of relation and the
 * text of the element the relation is based on.
 */
export interface ElementExistsQueryRelation {
    type: RelationsForConvenienceMethods;
    text: string;
}
/**
 * A single query passed to `expectAllExist`: the required element `type` and
 * the optional `text` and `relation` conditions.
 */
export interface ElementExistsQuery {
    type: keyof Pick<FluentFilters, ElementExistsQueryType>;
    text?: ElementExistsQueryText;
    relation?: ElementExistsQueryRelation;
}
/**
 * An `ElementExistsQuery` together with the information whether the queried
 * element exists.
 */
export interface ExpectExistenceElement extends ElementExistsQuery {
    exists: boolean;
}
/**
 * Result of `expectAllExist`: `allExist` is `true` when every element exists;
 * `elements` holds the per-query results.
 */
export interface ExpectAllExistResult {
    allExist: boolean;
    elements: ExpectExistenceElement[];
}
export declare class UiControlClient extends ApiCommands {
    private workspaceId;
    private executionRuntime;
    private stepReporter;
    private aiElementArgs;
    agent: AskUIAgent;
    private constructor();
    /**
     * Asynchronously creates a `UiControlClient`. The constructor is private,
     * so instances are obtained through this factory.
     *
     * @param {ClientArgs} [clientArgs] - Optional arguments to configure the client.
     *
     * @return {Promise<UiControlClient>}
     */
    static build(clientArgs?: ClientArgs): Promise<UiControlClient>;
    /**
     * Connects to the askui UI Controller.
     */
    connect(): Promise<UiControllerClientConnectionState>;
    /**
     * Disconnects from the askui UI Controller.
     */
    disconnect(): void;
    /**
     * Disconnects from the askui UI Controller.
     *
     * @deprecated Use {@link disconnect} instead.
     */
    close(): void;
    /**
     * Starts a video recording.
     */
    startVideoRecording(): Promise<void>;
    /**
     * Stops a video recording.
     */
    stopVideoRecording(): Promise<void>;
    /**
     * Reads the video recording.
     *
     * NOTE(review): the encoding of the returned string is not visible in this
     * declaration - confirm against the implementation.
     *
     * @return {Promise<string>}
     */
    readVideoRecording(): Promise<string>;
    private shouldAnnotateAfterCommandExecution;
    private beforeNoneInferenceCallCommandExecution;
    private afterCommandExecution;
    /**
     * Creates an annotation.
     *
     * @param {AnnotationRequest} [annotationRequest] - Optional annotation request.
     *
     * @return {Promise<Annotation>}
     */
    annotate(annotationRequest?: AnnotationRequest): Promise<Annotation>;
    /**
     * Annotates interactively.
     *
     * NOTE(review): what "interactively" involves is not visible in this
     * declaration - confirm against the implementation.
     */
    annotateInteractively(): Promise<void>;
    private escapeSeparatorString;
    private buildInstruction;
    private getAIElementsByNames;
    /**
     * Executes a fluent command given as an instruction string.
     *
     * NOTE(review): presumably the executor hook required by `ApiCommands` -
     * confirm against './dsl'.
     */
    fluentCommandExecutor(instructionString: string, modelComposition: ModelCompositionBranch[], context?: CommandExecutorContext): Promise<void>;
    /**
     * Executes a getter instruction and resolves to the detected elements.
     *
     * NOTE(review): presumably the executor hook required by `ApiCommands` -
     * confirm against './dsl'.
     */
    getterExecutor(instruction: string, context?: CommandExecutorContext): Promise<DetectedElement[]>;
    /**
     * Takes a prompt that contains a question you want to be answered
     * or the data you want to have extracted from your screen.
     *
     * The optional 'config' can be used to specify the JSON schema the
     * returned object shall have (https://json-schema.org).
     *
     * See the following examples on how to use it:
     *
     * let isWidgetsNew =
     *   await aui.ask(
     *     "Does the sidebar element 'Widgets' have a 'NEW' tag?",
     *     {
     *       json_schema: {
     *         "type": "boolean"
     *       }
     *     });
     *
     * Output of console.log(isWidgetsNew): true
     *
     * let newClients =
     *   await aui.ask(
     *     "How many new clients?",
     *     {
     *       json_schema: {
     *         "type": "number"
     *       }
     *     });
     *
     * Output of console.log(newClients): 9123
     *
     * let userNames =
     *   await aui.ask(
     *     "Return a list with the users names.",
     *     {
     *       json_schema: {
     *         "type": "array",
     *         "items": {
     *           "type": "string"
     *         }
     *       }
     *     });
     *
     * Output of console.log(userNames):
     * [
     *   'Yiorgos Avraamu',
     *   'Avram Tsarios',
     *   'Quintin Ed',
     *   'Enéas Kwadwo',
     *   'Agapetus Tadeáš'
     * ]
     *
     * let users =
     *   await aui.ask(
     *     "Extract the users from the table.",
     *     {
     *       json_schema: {
     *         "type": "array",
     *         "items": {
     *           "type": "object",
     *           "properties": {
     *             "name": {
     *               "type": "string"
     *             },
     *             "usage": {
     *               "type": "number"
     *             }
     *           },
     *           "additionalProperties": false,
     *           "required": ["name", "usage"]
     *         },
     *       },
     *     });
     *
     * Output of console.log(users):
     * [
     *   { name: 'Yiorgos Avraamu', usage: 50 },
     *   { name: 'Avram Tarasios', usage: 10 },
     *   { name: 'Quintin Ed', usage: 74 },
     *   { name: 'Eneás Kwadwo', usage: 98 },
     *   { name: 'Agapetus Tadeáš', usage: 22 }
     * ]
     *
     * @param {string} prompt - The question you want to be answered or
     *                          the data you want to have extracted.
     * @param {Object} config - object that specifies the return json: {json_schema: {...}}.
     * @returns {any} - The answer as JSON specified in the config object.
     */
    ask(prompt: string, config?: object): Promise<any>;
    private secretText;
    private getAndResetSecretText;
    /**
     * Types a text inside the filtered element.
     *
     * By default, the `text` is included in the logs and sent over to the askui Inference server to
     * predict in which context the typing has to occur. You can exclude the `text` from the logs
     * and the request to the askui Inference server setting `options.isSecret` to `true`.
     * This should not change the quality of the prediction of the askui Inference server. In this
     * case, `options.secretMask` is included in logs and sent over instead of the `text`.
     *
     * @param {string} text - A text to type.
     * @param {Object} [options]
     * @param {boolean} [options.isSecret = false] - If set to `true`, `text` is neither included in
     * logs of askui nor sent over to askui Inference for prediction.
     * @param {string} [options.secretMask = '****'] - If `options.isSecret` is set to `true`, this
     * is included in logs and sent over to askui Inference for prediction instead of the `text`.
     *
     * @return {FluentFilters}
     */
    typeIn(text: string, { isSecret, secretMask }?: {
        isSecret?: boolean;
        secretMask?: string;
    }): FluentFilters;
    /**
     * Types a text at the current position.
     *
     * By default, the `text` is included in the logs and sent over to the askui Inference server to
     * predict in which context the typing has to occur. You can exclude the `text` from the logs
     * and the request to the askui Inference server setting `options.isSecret` to `true`.
     * This should not change the quality of the prediction of the askui Inference server. In this
     * case, `options.secretMask` is included in logs and sent over instead of the `text`.
     *
     * @param {string} text - A text to type.
     * @param {Object} [options]
     * @param {boolean} [options.isSecret = false] - If set to `true`, `text` is neither included in
     * logs of askui nor sent over to askui Inference for prediction.
     * @param {string} [options.secretMask = '****'] - If `options.isSecret` is set to `true`, this
     * is included in logs and sent over to askui Inference for prediction instead of the `text`.
     *
     * @return {Exec}
     */
    type(text: string, { isSecret, secretMask }?: {
        isSecret?: boolean;
        secretMask?: string;
    }): Exec;
    /**
     * Waits for `<delayInMs>` ms, e.g., 1000 ms. The exact delay may be a little longer
     * than `<delayInMs>` but never shorter than that.
     *
     * @param {number} delayInMs - The delay in ms to wait for.
     *
     * @return {Executable}
     */
    waitFor(delayInMs: number): Executable;
    /**
     * Press a key multiple times. At least two times.
     *
     * @param {PC_AND_MODIFIER_KEY} key
     *
     * @param {number} times
     */
    pressKeyNTimes(key: PC_AND_MODIFIER_KEY, times?: number): Promise<void>;
    /**
     * Press an array of keys one after another.
     *
     * For example press the following keys: right, left, enter.
     *
     * pressKeys(['right', 'left', 'enter'])
     *
     * @param {PC_AND_MODIFIER_KEY[]} keys
     */
    pressKeys(keys: PC_AND_MODIFIER_KEY[]): Promise<void>;
    /**
     * Searches for text elements and clicks them
     * one after another when found.
     *
     * @param {string[]} texts - An array of texts to be searched.
     */
    clickTexts(texts: string[]): Promise<void>;
    /**
     * Searches for an element of type textfield with a specific placeholder text.
     * If found, clicks it.
     *
     * @param {string} placeholder - The textfields placeholder text.
     */
    clickTextfield(placeholder: string): Promise<void>;
    /**
     * Searches for an element of type textfield with a specific
     * label nearest to it. If found, clicks it.
     *
     * @param {string} label - The textfields label.
     */
    clickTextfieldNearestTo(label: string): Promise<void>;
    /**
     * Wait until an AskUICommand does not fail.
     *
     * Use it to wait for an element to appear like this:
     *
     * await aui.waitUntil(
     *   aui.expect().text('Github').exists()
     * );
     *
     * @param {Executable} AskUICommand - For example: aui.moveMouse(0, 0)
     * @param {number} maxTry - Number of maximum retries
     * @param {number} waitTime - Time in milliseconds
     */
    waitUntil(AskUICommand: Executable, maxTry?: number, waitTime?: number): Promise<void>;
    private evaluateRelation;
    /**
     * Click a button with a specific label.
     * Optional relation identifies the button in relation to another element.
     *
     * **Examples:**
     * ```typescript
     * await aui.clickButton({})
     * await aui.clickButton({label: 'Checkout here'})
     * await aui.clickButton({relation: {type: 'leftOf', text: 'Choose a ticket'}})
     * await aui.clickButton({label: 'Click', relation: {type: 'leftOf', text: 'Choose a ticket'}})
     * ```
     *
     * @param {Object} params - Object containing properties.
     * @property {string} [params.label] - The text label of the button. Defaults to an empty string.
     * @property {Object} [params.relation] - Object describing the relationship between
     *                                        the clicked button and another element.
     * @property {RelationsForConvenienceMethods} params.relation.type - The type of relation.
     * @property {string} params.relation.text - The text element the relation is based on.
     */
    clickButton(params: {
        label?: string;
        relation?: {
            type: RelationsForConvenienceMethods;
            text: string;
        };
    }): Promise<void>;
    /**
     * Click a checkbox with a specific label.
     * You can also specify where the label is placed relationally.
     *
     * **Examples:**
     * ```typescript
     * await aui.clickCheckbox({label: 'Toggle'})
     * await aui.clickCheckbox({label: 'Toggle', relation: {type: 'leftOf'}})
     * ```
     *
     * @param {Object} params - Object containing required `label` property and
     *                          optional `relation` property.
     * @property {string} params.label - The label for the checkbox.
     * @property {Object} [params.relation] - Object describing the relationship between
     *                                        the clicked checkbox and another element.
     * @property {RelationsForConvenienceMethods} params.relation.type - The type of relation.
     */
    clickCheckbox(params: {
        label: string;
        relation?: {
            type: RelationsForConvenienceMethods;
        };
    }): Promise<void>;
    /**
     * Click a switch with a specific label.
     * You can also specify where the label is placed relationally.
     *
     * **Examples:**
     * ```typescript
     * await aui.clickSwitch({label: 'Toggle'})
     * await aui.clickSwitch({label: 'Toggle', relation: {type: 'leftOf'}})
     * ```
     *
     * @param {Object} params - Object containing required `label` property and
     *                          optional `relation` property.
     * @property {string} params.label - The label for the switch.
     * @property {Object} [params.relation] - Object describing the relationship between
     *                                        the clicked switch and another element.
     * @property {RelationsForConvenienceMethods} params.relation.type - The type of relation.
     */
    clickSwitch(params: {
        label: string;
        relation?: {
            type: RelationsForConvenienceMethods;
        };
    }): Promise<void>;
    /**
     * Types a given text into a textfield.
     * Use a relation to specify how to find
     * the textfield in relation to a specific label.
     *
     * **Examples:**
     * ```typescript
     * // Finds the textfield nearest to the label 'Email'
     * await aui.typeIntoTextfield({textToWrite: 'Hello World', relation: {label: 'Email'}});
     *
     * // Finds the textfield above/below a label 'Password'
     * await aui.typeIntoTextfield(
     *   {textToWrite: 'Hello World', relation: {type: 'above', label: 'Password'}}
     * );
     * await aui.typeIntoTextfield(
     *   {textToWrite: 'Hello World', relation: {type: 'below', label: 'Password'}}
     * );
     *
     * // If there is no label but a placeholder, the label is contained in the textfield
     * await aui.typeIntoTextfield(
     *   {textToWrite: 'Hello World', relation: {type: 'contains', label: 'Enter email'}}
     * );
     * ```
     *
     * @param {Object} params - Object containing required `textToWrite` property and
     *                          optional `relation` property.
     * @property {string} params.textToWrite - The text to be typed into the textfield.
     * @property {Object} params.relation - Object describing the relationship between the
     *                                      textfield being interacted with and another element.
     * @property {RelationsForConvenienceMethods} params.relation.type - The type of
     *                                      relation, optional.
     * @property {string} params.relation.label - The label associated with the related
     *                                      element, optional.
     */
    typeIntoTextfield(params: {
        textToWrite: string;
        relation: {
            type?: RelationsForConvenienceMethods;
            label: string;
        };
    }): Promise<void>;
    /**
     * Click on a specific text.
     * You can also use a RegEx or match the text exactly by specifying the specific flag.
     * Use a relation to find the text in relation to a specific text.
     *
     * **Examples:**
     * ```typescript
     * // Click text that is similar to 'askui'
     * await aui.clickText({text: 'askui', matching: 'similar'})
     *
     * // Click text that contains 'pie' or 'cake' or 'Pie' or 'Cake'
     * await aui.clickText({text: '.*([Pp]ie|[Cc]ake).*', matching: 'regex'})
     *
     * // Click the text 'TERMINAL' that is left of the text 'PORTS'
     * await aui.clickText({
     *   text: 'TERMINAL',
     *   matching: "exact",
     *   relation: { type: 'leftOf', text: 'PORTS' }
     * })
     * ```
     *
     * @param {Object} params - Object containing required `text` property and optional properties
     *                          for regular expression matching and relation.
     * @property {string} params.text - The text to be clicked.
     * @property {string} params.matching - Whether the text is matched using similarity,
     *                                      exact match or a regular expression.
     * @property {Object} [params.relation] - Object describing the relationship between the
     *                                        clicked text and another element.
     * @property {RelationsForConvenienceMethods} params.relation.type - The type of relation.
     * @property {string} params.relation.text - The label or text associated with the
     *                                           related element or state.
     */
    clickText(params: {
        text: string;
        matching: TextMatchingOption;
        relation?: {
            type: RelationsForConvenienceMethods;
            text: string;
        };
    }): Promise<void>;
    private evaluateMatchingProperty;
    /**
     * Check if one or multiple elements are detected.
     *
     * **Examples:**
     * ```typescript
     * await aui.expectAllExist([
     *   {
     *     type: 'text',
     *     text: {
     *       value: 'Switch to Dark',
     *       matching: 'similar'
     *     }
     *   },
     * ]);
     *
     * // Check for existence of multiple elements
     * await aui.expectAllExist([
     *   {
     *     type: 'textfield',
     *     relation: {
     *       type: 'rightOf',
     *       text: 'Email:'
     *     }
     *   },
     *   {
     *     type: 'element',
     *     text: {
     *       value: 'Switch to Dark'
     *     }
     *   },
     * ]);
     *
     * // Validate existence
     * const exists = await aui.expectAllExist([...]);
     * exists.allExist // true when every element exists
     *
     * // Check which elements do not exist
     * // with the elements property
     * const nonExistentElements = exists.elements.filter((e) => e.exists===false)
     * ```
     *
     * @param {ElementExistsQuery[]} query - Objects containing the required property
     *                                       'type' and the optional properties
     *                                       'text' and 'relation'.
     * @property {string} query.type - The type of the element: 'otherElement' | 'switch' |
     *                                 'element' | 'container' | 'checkbox' |
     *                                 'button' | 'table' | 'text' | 'icon' | 'image' | 'textfield'
     * @property {Object} [query.text] - Object containing value and matching strategy.
     * @property {string} query.text.value - The text to match for.
     * @property {string} [query.text.matching] - Whether the text is matched using similarity,
     *                                            exact match or a regular expression.
     * @property {Object} [query.relation] - Object describing the relationship between the
     *                                       queried element and another element.
     * @property {RelationsForConvenienceMethods} query.relation.type - The type of relation.
     * @property {string} query.relation.text - The label or text associated with the
     *                                          related element or state.
     * @returns {ExpectAllExistResult.allExist} - If every element exists.
     * @returns {ExpectAllExistResult.elements} - ExpectExistenceElement[].
     */
    expectAllExist(query: ElementExistsQuery[]): Promise<ExpectAllExistResult>;
    /**
     * Holds down a key on the keyboard.
     *
     * **Examples:**
     * ```typescript
     * await aui.keyDown('a').exec();
     * ```
     *
     * @param {PC_AND_MODIFIER_KEY} key - The key to hold down.
     */
    keyDown(key: PC_AND_MODIFIER_KEY): Executable;
    /**
     * Releases a key up that was previously held down.
     *
     * **Examples:**
     * ```typescript
     * await aui.keyUp('a').exec();
     * ```
     *
     * @param {PC_AND_MODIFIER_KEY} key - The key to release up.
     */
    keyUp(key: PC_AND_MODIFIER_KEY): Executable;
    /**
     * Instructs the agent to autonomously achieve a specified goal through UI interactions.
     *
     * This method enables AI-powered automation by allowing the agent to:
     * - Analyze the current screen state and/or provided images
     * - Plan and execute a sequence of UI interactions
     * - Handle complex tasks through natural language instructions
     * - Maintain context across multiple actions
     *
     * The agent can perform various UI interactions including:
     * - Clicking buttons, links, and other interactive elements
     * - Typing text into input fields
     * - Scrolling and navigating through interfaces
     *
     * ### Method Signatures
     * ```typescript
     * act(goal: string, options?: ActOptions): Promise<AgentHistory>
     * act(goal: string, imagePathOrBase64String: string, options?: ActOptions): Promise<AgentHistory>
     * ```
     *
     * ### Parameters
     * @param goal - A natural language instruction describing the task to accomplish.
     *              Be specific and clear about the desired outcome.
     * @param imagePathOrBase64String - (Optional) Path to an image file or base64-encoded
     *              image string. Used to provide additional visual context for the task.
     * @param options - (Optional) Configuration options for the agent's behavior.
     * @param options.chatId - A unique identifier to maintain context between related actions.
     *              Useful for multi-step tasks that require state preservation.
     * @param options.agentHistory
     *              - (Optional) Previous interaction history to share between
     *              different agent instances. Enables cross-platform task coordination.
     *
     * ### Returns
     * @returns Promise<AgentHistory> - A promise that resolves to the updated interaction history,
     *              containing details about the actions taken and their outcomes.
     *
     * ### Throws
     * - If the agent is not properly connected
     * - If the provided goal cannot be understood or executed
     * - If required UI elements are not found or accessible
     * - If the image path is invalid or the base64 string is malformed
     *
     * ### Examples
     *
     * #### Basic Usage
     * ```typescript
     * // Simple task execution
     * await aui.act("Open Chrome and navigate to google.com");
     * ```
     *
     * #### Maintaining Context
     * ```typescript
     * // Multi-step task with context preservation
     * await aui.act("Search for current gold prices", {
     *   chatId: "gold-price-task"
     * });
     *
     * await aui.act("Create a new text file and save the price", {
     *   chatId: "gold-price-task"
     * });
     * ```
     *
     * #### Cross-Platform Coordination
     * ```typescript
     * // Share context between desktop and mobile agents
     * await auiAndroid.agent.configureAsAndroidAgent();
     *
     * const history = await auiDesktop.act("Copy username from desktop app");
     * await auiAndroid.act("Paste username into mobile login", {
     *   agentHistory: history
     * });
     * ```
     *
     * #### Using Images for Context
     * ```typescript
     * // Using image file
     * await aui.act(
     *   "Click the 'Submit' button in the provided image",
     *   'path/to/screenshot.png'
     * );
     *
     * // Using base64 image
     * const base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...";
     * await aui.act(
     *   "Click the 'Submit' button in the provided image",
     *   base64Image
     * );
     * ```
     *
     * ### Best Practices
     * 1. Be specific in your goal descriptions
     * 2. Use chatId for related tasks to maintain context
     * 3. Provide clear visual context when needed
     * 4. Handle errors appropriately in your implementation
     * 5. Consider using agentHistory for complex cross-platform workflows
     */
    act(goal: string, options?: ActOptions): Promise<AgentHistory>;
    act(goal: string, imagePathOrBase64String: string, options?: ActOptions): Promise<AgentHistory>;
    /**
     * Adds tools to the agent that allow it to interact with AI elements.
     *
     * @returns {Promise<void>} - A promise that resolves when the tools are added to the agent.
     */
    addAIElementsToolsToAgent(): Promise<void>;
    /**
     * Retrieves the starting arguments used when the controller server was initialized.
     *
     * Useful for debugging, logging, or verifying the current server configuration.
     *
     * @property {string} displayNum - Display number controlled by the controller
     * @property {boolean} minimize - Whether controller starts minimized
     * @property {string} runtime - Runtime type ("desktop" or "android")
     * @property {number} port - Communication port
     * @property {number} actionWaitTime - Action wait time
     * @property {string} host - Host address
     * @property {string} logFile - Log file path
     * @property {boolean} hideOverlay - Whether overlay is hidden
     * @property {boolean} debugDraw - Whether debug drawing is enabled
     * @property {string} deviceId - Android device ID
     * @property {string} configFile - Configuration file path
     * @property {string} logLevel - Logging level
     *
     * @example
     * ```typescript
     * const startingArguments = await aui.getControllerStartingArguments();
     * console.log(startingArguments);
     * // Output example:
     * // {
     * //   displayNum: 0,
     * //   minimize: true,
     * //   runtime: 'desktop',
     * //   port: 5000,
     * //   actionWaitTime: 1000,
     * //   host: '127.0.0.1',
     * //   logFile: '/tmp/askui/askui-server.log',
     * //   hideOverlay: false,
     * //   debugDraw: false,
     * //   deviceId: 'emulator-5554',
     * //   configFile: '/tmp/askui/askui-config.json',
     * //   logLevel: 'info',
     * // }
     * ```
     *
     * @example Retrieving Android device ID:
     * ```typescript
     * const startingArguments = await aui.getControllerStartingArguments();
     * console.log(startingArguments.deviceId);
     * // Output example: "emulator-5554"
     * ```
     */
    getControllerStartingArguments(): Promise<Record<'displayNum' | 'minimize' | 'runtime' | 'port' | 'actionWaitTime' | 'host' | 'logFile' | 'hideOverlay' | 'debugDraw' | 'deviceId' | 'configFile' | 'logLevel', string | number | boolean>>;
}