askui
Version:
Reliable, automated end-to-end testing that depends on what is shown on your screen instead of the technology you are running on.
653 lines (652 loc) • 26.6 kB
TypeScript
import { Exec, Executable, FluentFilters, ApiCommands, PC_AND_MODIFIER_KEY, CommandExecutorContext } from './dsl';
import { UiControllerClientConnectionState } from './ui-controller-client-connection-state';
import { Annotation } from '../core/annotation/annotation';
import { AnnotationRequest } from '../core/model/annotation-result/annotation-interface';
import { DetectedElement } from '../core/model/annotation-result/detected-element';
import { ClientArgs } from './ui-controller-client-interface';
import { ModelCompositionBranch } from './model-composition-branch';
import { AskUIAgent, AgentHistory, ActOptions } from '../core/models/anthropic';
/**
* Spatial relations accepted by the convenience methods of `UiControlClient`
* (for example `clickButton` or `clickText`) to locate an element relative
* to another one.
*/
export type RelationsForConvenienceMethods =
  | 'nearestTo'
  | 'leftOf'
  | 'above'
  | 'rightOf'
  | 'below'
  | 'contains';
/**
* How a text is matched: by similarity (`'similar'`), exactly (`'exact'`)
* or as a regular expression (`'regex'`).
*/
export type TextMatchingOption =
  | 'similar'
  | 'exact'
  | 'regex';
/**
* Names of the element filters that may be used as the `type` of an
* {@link ElementExistsQuery}.
*
* Each member is listed exactly once; a repeated union constituent adds
* nothing to the type and only obscures the list of supported values.
*/
export type ElementExistsQueryType = 'otherElement' | 'switch' | 'element' | 'container' | 'checkbox' | 'button' | 'table' | 'text' | 'icon' | 'image' | 'textfield';
/**
* Text criterion of an {@link ElementExistsQuery}.
*/
export interface ElementExistsQueryText {
/** The text to match for. */
value: string;
/**
* Whether the text is matched using similarity, exact match or a regular
* expression. Optional; the strategy applied when omitted is not visible
* in this declaration.
*/
matching?: TextMatchingOption;
}
/**
* Relation criterion of an {@link ElementExistsQuery}: locates the queried
* element relative to a text.
*/
export interface ElementExistsQueryRelation {
/** The type of relation. */
type: RelationsForConvenienceMethods;
/** The label or text associated with the related element. */
text: string;
}
/**
* Describes one element whose existence is checked by
* `UiControlClient.expectAllExist`.
*/
export interface ElementExistsQuery {
/**
* The type of the element. Restricted to the members of
* `ElementExistsQueryType` that are also keys of `FluentFilters`.
*/
type: keyof Pick<FluentFilters, ElementExistsQueryType>;
/** Optional text value and matching strategy. */
text?: ElementExistsQueryText;
/** Optional relation of the element to a text. */
relation?: ElementExistsQueryRelation;
}
/**
* An {@link ElementExistsQuery} together with the outcome of its
* existence check.
*/
export interface ExpectExistenceElement extends ElementExistsQuery {
/** Whether the queried element exists. */
exists: boolean;
}
/**
* Result of `UiControlClient.expectAllExist`.
*/
export interface ExpectAllExistResult {
/** `true` when every queried element exists. */
allExist: boolean;
/** The queried elements, each carrying its own `exists` flag. */
elements: ExpectExistenceElement[];
}
/**
* Client used to talk to the askui UI Controller.
*
* Extends `ApiCommands` and adds connection handling, annotation, and a set
* of convenience methods. The constructor is private; obtain an instance via
* the static {@link UiControlClient.build} factory.
*/
export declare class UiControlClient extends ApiCommands {
private workspaceId;
private executionRuntime;
private stepReporter;
private aiElementArgs;
/**
* The agent associated with this client. The `act` examples below
* configure it via `agent.configureAsAndroidAgent()`.
*/
agent: AskUIAgent;
private constructor();
/**
* Asynchronously creates a `UiControlClient`.
*
* @param {ClientArgs} [clientArgs] - Optional client configuration.
*
* @return {Promise<UiControlClient>}
*/
static build(clientArgs?: ClientArgs): Promise<UiControlClient>;
/**
* Connects to the askui UI Controller.
*/
connect(): Promise<UiControllerClientConnectionState>;
/**
* Disconnects from the askui UI Controller.
*/
disconnect(): void;
/**
* Disconnects from the askui UI Controller.
*
* @deprecated Use {@link disconnect} instead.
*/
close(): void;
/**
* Starts a video recording.
* NOTE(review): what exactly is recorded is not visible in this declaration.
*/
startVideoRecording(): Promise<void>;
/**
* Stops the video recording.
*/
stopVideoRecording(): Promise<void>;
/**
* Reads the video recording and resolves to it as a string.
* NOTE(review): the encoding of the string is not visible here - confirm.
*/
readVideoRecording(): Promise<string>;
private shouldAnnotateAfterCommandExecution;
private beforeNoneInferenceCallCommandExecution;
private afterCommandExecution;
/**
* Creates an annotation.
*
* @param {AnnotationRequest} [annotationRequest] - Optional annotation request.
*
* @return {Promise<Annotation>}
*/
annotate(annotationRequest?: AnnotationRequest): Promise<Annotation>;
/**
* Interactive variant of {@link annotate}; resolves without a value.
*/
annotateInteractively(): Promise<void>;
private escapeSeparatorString;
private buildInstruction;
private getAIElementsByNames;
/**
* Executes a fluent command given as an instruction string.
*
* @param {string} instructionString - The instruction to execute.
* @param {ModelCompositionBranch[]} modelComposition - The model composition to use.
* @param {CommandExecutorContext} [context] - Optional execution context.
*/
fluentCommandExecutor(instructionString: string, modelComposition: ModelCompositionBranch[], context?: CommandExecutorContext): Promise<void>;
/**
* Executes a getter instruction and resolves to the detected elements.
*
* @param {string} instruction - The instruction to execute.
* @param {CommandExecutorContext} [context] - Optional execution context.
*
* @return {Promise<DetectedElement[]>}
*/
getterExecutor(instruction: string, context?: CommandExecutorContext): Promise<DetectedElement[]>;
/**
* Takes a prompt that contains a question you want to be answered
* or the data you want to have extracted from your screen.
*
* The optional 'config' can be used to specify the JSON schema the
* returned object shall have (https://json-schema.org).
*
* See the following examples on how to use it:
*
* let isWidgetsNew =
* await aui.ask(
* "Does the sidebar element 'Widgets' have a 'NEW' tag?",
* {
* json_schema: {
* "type": "boolean"
* }
* });
*
* Output of console.log(isWidgetsNew): true
*
* let newClients =
* await aui.ask(
* "How many new clients?",
* {
* json_schema: {
* "type": "number"
* }
* });
*
* Output of console.log(newClients): 9123
*
* let userNames =
* await aui.ask(
* "Return a list with the users names.",
* {
* json_schema: {
* "type": "array",
* "items": {
* "type": "string"
* }
* }
* });
*
* Output of console.log(userNames):
* [
* 'Yiorgos Avraamu',
* 'Avram Tsarios',
* 'Quintin Ed',
* 'Enéas Kwadwo',
* 'Agapetus Tadeáš'
* ]
*
* let users =
* await aui.ask(
* "Extract the users from the table.",
* {
* json_schema: {
* "type": "array",
* "items": {
* "type": "object",
* "properties": {
* "name": {
* "type": "string"
* },
* "usage": {
* "type": "number"
* }
* },
* "additionalProperties": false,
* "required": ["name", "usage"]
* },
* },
* });
*
* Output of console.log(users):
* [
* { name: 'Yiorgos Avraamu', usage: 50 },
* { name: 'Avram Tarasios', usage: 10 },
* { name: 'Quintin Ed', usage: 74 },
* { name: 'Eneás Kwadwo', usage: 98 },
* { name: 'Agapetus Tadeáš', usage: 22 }
* ]
*
* @param {string} prompt - The question you want to be answered or
* the data you want to have extracted.
* @param {Object} [config] - object that specifies the return json: {json_schema: {...}}.
* @returns {any} - The answer as JSON specified in the config object.
*/
ask(prompt: string, config?: object): Promise<any>;
private secretText;
private getAndResetSecretText;
/**
* Types a text inside the filtered element.
*
* By default, the `text` is included in the logs and sent over to the askui Inference server to
* predict in which context the typing has to occur. You can exclude the `text` from the logs
* and the request to the askui Inference server setting `options.isSecret` to `true`.
* This should not change the quality of the prediction of the askui Inference server. In this
* case, `options.secretMask` is included in logs and sent over instead of the `text`.
*
* @param {string} text - A text to type.
* @param {Object} [options]
* @param {boolean} [options.isSecret = false] - If set to `true`, `text` is neither included in
* logs of askui nor sent over to askui Inference for prediction.
* @param {string} [options.secretMask = '****'] - If `options.isSecret` is set to `true`, this
* is included in logs and sent over to askui Inference for prediction instead of the `text`.
*
* @return {FluentFilters}
*/
typeIn(text: string, { isSecret, secretMask }?: {
isSecret?: boolean;
secretMask?: string;
}): FluentFilters;
/**
* Types a text at the current position.
*
* By default, the `text` is included in the logs and sent over to the askui Inference server to
* predict in which context the typing has to occur. You can exclude the `text` from the logs
* and the request to the askui Inference server setting `options.isSecret` to `true`.
* This should not change the quality of the prediction of the askui Inference server. In this
* case, `options.secretMask` is included in logs and sent over instead of the `text`.
*
* @param {string} text - A text to type.
* @param {Object} [options]
* @param {boolean} [options.isSecret = false] - If set to `true`, `text` is neither included in
* logs of askui nor sent over to askui Inference for prediction.
* @param {string} [options.secretMask = '****'] - If `options.isSecret` is set to `true`, this
* is included in logs and sent over to askui Inference for prediction instead of the `text`.
*
* @return {Exec}
*/
type(text: string, { isSecret, secretMask }?: {
isSecret?: boolean;
secretMask?: string;
}): Exec;
/**
* Waits for `<delayInMs>` ms, e.g., 1000 ms. The exact delay may be a little longer
* than `<delayInMs>` but never shorter than that.
*
* @param {number} delayInMs - The delay in ms to wait for.
*
* @return {Executable}
*/
waitFor(delayInMs: number): Executable;
/**
* Press a key multiple times. At least two times.
*
* @param {PC_AND_MODIFIER_KEY} key - The key to press.
*
* @param {number} [times] - How often the key is pressed. Optional; the value used
* when omitted is not visible in this declaration.
*/
pressKeyNTimes(key: PC_AND_MODIFIER_KEY, times?: number): Promise<void>;
/**
* Press an array of keys one after another.
*
* For example press the following keys: right, left, enter.
*
* pressKeys(['right', 'left', 'enter'])
*
* @param {PC_AND_MODIFIER_KEY[]} keys - The keys to press, in order.
*/
pressKeys(keys: PC_AND_MODIFIER_KEY[]): Promise<void>;
/**
* Searches for text elements and clicks them
* one after another when found.
*
* @param {string[]} texts - An array of texts to be searched.
*/
clickTexts(texts: string[]): Promise<void>;
/**
* Searches for an element of type textfield with a specific placeholder text.
* If found, clicks it.
*
* @param {string} placeholder - The textfield's placeholder text.
*/
clickTextfield(placeholder: string): Promise<void>;
/**
* Searches for an element of type textfield with a specific
* label nearest to it. If found, clicks it.
*
* @param {string} label - The textfield's label.
*/
clickTextfieldNearestTo(label: string): Promise<void>;
/**
* Wait until an AskUICommand does not fail.
*
* Use it to wait for an element to appear like this:
*
* await aui.waitUntil(
* aui.expect().text('Github').exists()
* );
*
* @param {Executable} AskUICommand - For example: aui.moveMouse(0, 0)
* @param {number} [maxTry] - Number of maximum retries
* @param {number} [waitTime] - Time in milliseconds
*/
waitUntil(AskUICommand: Executable, maxTry?: number, waitTime?: number): Promise<void>;
private evaluateRelation;
/**
* Click a button with a specific label.
* Optional relation identifies the button in relation to another element.
*
* **Examples:**
* ```typescript
* await aui.clickButton({})
* await aui.clickButton({label: 'Checkout here'})
* await aui.clickButton({relation: {type: 'leftOf', text: 'Choose a ticket'}})
* await aui.clickButton({label: 'Click', relation: {type: 'leftOf', text: 'Choose a ticket'}})
* ```
*
* @param {Object} params - Object containing properties.
* @property {string} [params.label] - The text label of the button. Defaults to an empty string.
* @property {Object} [params.relation] - Object describing the relationship between
* the clicked button and another element.
* @property {RelationsForConvenienceMethods} params.relation.type - The type of relation.
* @property {string} params.relation.text - The text element the relation is based on.
*/
clickButton(params: {
label?: string;
relation?: {
type: RelationsForConvenienceMethods;
text: string;
};
}): Promise<void>;
/**
* Click a checkbox with a specific label.
* You can also specify where the label is placed relationally.
*
* **Examples:**
* ```typescript
* await aui.clickCheckbox({label: 'Toggle'})
* await aui.clickCheckbox({label: 'Toggle', relation: {type: 'leftOf'}})
* ```
*
* @param {Object} params - Object containing required `label` property and
* optional `relation` property.
* @property {string} params.label - The label for the checkbox.
* @property {Object} [params.relation] - Object describing the relationship between
* the clicked checkbox and another element.
* @property {RelationsForConvenienceMethods} params.relation.type - The type of relation.
*/
clickCheckbox(params: {
label: string;
relation?: {
type: RelationsForConvenienceMethods;
};
}): Promise<void>;
/**
* Click a switch with a specific label.
* You can also specify where the label is placed relationally.
*
* **Examples:**
* ```typescript
* await aui.clickSwitch({label: 'Toggle'})
* await aui.clickSwitch({label: 'Toggle', relation: {type: 'leftOf'}})
* ```
*
* @param {Object} params - Object containing required `label` property and
* optional `relation` property.
* @property {string} params.label - The label for the switch.
* @property {Object} [params.relation] - Object describing the relationship between
* the clicked switch and another element.
* @property {RelationsForConvenienceMethods} params.relation.type - The type of relation.
*/
clickSwitch(params: {
label: string;
relation?: {
type: RelationsForConvenienceMethods;
};
}): Promise<void>;
/**
* Types a given text into a textfield.
* Use a relation to specify how to find
* the textfield in relation to a specific label.
*
* **Examples:**
* ```typescript
* // Finds the textfield nearest to the label 'Email'
* await aui.typeIntoTextfield({textToWrite: 'Hello World', relation: {label: 'Email'}});
*
* // Finds the textfield above/below a label 'Password'
* await aui.typeIntoTextfield(
* {textToWrite: 'Hello World', relation: {type: 'above', label: 'Password'}}
* );
* await aui.typeIntoTextfield(
* {textToWrite: 'Hello World', relation: {type: 'below', label: 'Password'}}
* );
*
* // If there is no label but a placeholder, the label is contained in the textfield
* await aui.typeIntoTextfield(
* {textToWrite: 'Hello World', relation: {type: 'contains', label: 'Enter email'}}
* );
* ```
*
* @param {Object} params - Object containing the required `textToWrite` and
* `relation` properties.
* @property {string} params.textToWrite - The text to be typed into the textfield.
* @property {Object} params.relation - Object describing the relationship between the
* textfield being interacted with and another element.
* @property {RelationsForConvenienceMethods} [params.relation.type] - The type of
* relation, optional.
* @property {string} params.relation.label - The label associated with the related
* element, required.
*/
typeIntoTextfield(params: {
textToWrite: string;
relation: {
type?: RelationsForConvenienceMethods;
label: string;
};
}): Promise<void>;
/**
* Click on a specific text.
* You can also use a RegEx or match the text exactly by specifying the specific flag.
* Use a relation to find the text in relation to a specific text.
*
* **Examples:**
* ```typescript
* // Click text that is similar to 'askui'
* await aui.clickText({text: 'askui', matching: 'similar'})
*
* // Click text that contains 'pie' or 'cake' or 'Pie' or 'Cake'
* await aui.clickText({text: '.*([Pp]ie|[Cc]ake).*', matching: 'regex'})
*
* // Click the text 'TERMINAL' that is left of the text 'PORTS'
* await aui.clickText({
* text: 'TERMINAL',
* matching: "exact",
* relation: { type: 'leftOf', text: 'PORTS' }
* })
* ```
*
* @param {Object} params - Object containing required `text` and `matching` properties
* and an optional `relation` property.
* @property {string} params.text - The text to be clicked.
* @property {string} params.matching - Whether the text is matched using similarity,
* exact match or a regular expression.
* @property {Object} [params.relation] - Object describing the relationship between the
* clicked text and another element.
* @property {RelationsForConvenienceMethods} params.relation.type - The type of relation.
* @property {string} params.relation.text - The label or text associated with the
* related element or state.
*/
clickText(params: {
text: string;
matching: TextMatchingOption;
relation?: {
type: RelationsForConvenienceMethods;
text: string;
};
}): Promise<void>;
private evaluateMatchingProperty;
/**
* Check if one or multiple elements are detected.
*
* **Examples:**
* ```typescript
* await aui.expectAllExist([
* {
* type: 'text',
* text: {
* value: 'Switch to Dark',
* matching: 'similar'
* }
* },
* ]);
*
* // Check for existence of multiple elements
* await aui.expectAllExist([
* {
* type: 'textfield',
* relation: {
* type: 'rightOf',
* text: 'Email:'
* }
* },
* {
* type: 'element',
* text: {
* value: 'Switch to Dark'
* }
* },
* ]);
*
* // Validate existence
* const exists = await aui.expectAllExist([...]);
* exists.allExist // true when every element exists
*
* // Check which elements do not exist
* // with the elements property
* const nonExistentElements = exists.elements.filter((e) => e.exists===false)
* ```
*
* @param {ElementExistsQuery[]} query - Objects containing the required property
* 'type' and the optional properties
* 'text' and 'relation'.
* @property {string} query.type - The type of the element: 'otherElement' | 'switch' |
* 'element' | 'container' | 'checkbox' |
* 'button' | 'table' | 'text' | 'icon' | 'image' | 'textfield'
* @property {Object} [query.text] - Object containing value and matching strategy.
* @property {string} query.text.value - The text to match for.
* @property {string} [query.text.matching] - Whether the text is matched using similarity,
* exact match or a regular expression.
* @property {Object} [query.relation] - Object describing the relationship between the
* queried element and another element.
* @property {RelationsForConvenienceMethods} query.relation.type - The type of relation.
* @property {string} query.relation.text - The label or text associated with the
* related element or state.
* @returns {Promise<ExpectAllExistResult>} - Object with `allExist` (true when every
* element exists) and `elements` (an ExpectExistenceElement[] in which each entry
* carries its own `exists` flag).
*/
expectAllExist(query: ElementExistsQuery[]): Promise<ExpectAllExistResult>;
/**
* Holds down a key on the keyboard.
*
* **Examples:**
* ```typescript
* await aui.keyDown('a').exec();
* ```
*
* @param {PC_AND_MODIFIER_KEY} key - The key to hold down.
*/
keyDown(key: PC_AND_MODIFIER_KEY): Executable;
/**
* Releases a key up that was previously held down.
*
* **Examples:**
* ```typescript
* await aui.keyUp('a').exec();
* ```
*
* @param {PC_AND_MODIFIER_KEY} key - The key to release up.
*/
keyUp(key: PC_AND_MODIFIER_KEY): Executable;
/**
* Instructs the agent to autonomously achieve a specified goal through UI interactions.
*
* This method enables AI-powered automation by allowing the agent to:
* - Analyze the current screen state and/or provided images
* - Plan and execute a sequence of UI interactions
* - Handle complex tasks through natural language instructions
* - Maintain context across multiple actions
*
* The agent can perform various UI interactions including:
* - Clicking buttons, links, and other interactive elements
* - Typing text into input fields
* - Scrolling and navigating through interfaces
*
* ### Method Signatures
* ```typescript
* act(goal: string, options?: ActOptions): Promise<AgentHistory>
* act(goal: string, imagePathOrBase64String: string, options?: ActOptions): Promise<AgentHistory>
* ```
*
* ### Parameters
* @param goal - A natural language instruction describing the task to accomplish.
* Be specific and clear about the desired outcome.
* @param imagePathOrBase64String - (Optional) Path to an image file or base64-encoded
* image string. Used to provide additional visual context for the task.
* @param options - (Optional) Configuration options for the agent's behavior.
* @param options.chatId - A unique identifier to maintain context between related actions.
* Useful for multi-step tasks that require state preservation.
* @param options.agentHistory - (Optional) Previous interaction history to share between
* different agent instances. Enables cross-platform task coordination.
*
* ### Returns
* @returns Promise<AgentHistory> - A promise that resolves to the updated interaction history,
* containing details about the actions taken and their outcomes.
*
* ### Throws
* - If the agent is not properly connected
* - If the provided goal cannot be understood or executed
* - If required UI elements are not found or accessible
* - If the image path is invalid or the base64 string is malformed
*
* ### Examples
*
* #### Basic Usage
* ```typescript
* // Simple task execution
* await aui.act("Open Chrome and navigate to google.com");
* ```
*
* #### Maintaining Context
* ```typescript
* // Multi-step task with context preservation
* await aui.act("Search for current gold prices", {
* chatId: "gold-price-task"
* });
*
* await aui.act("Create a new text file and save the price", {
* chatId: "gold-price-task"
* });
* ```
*
* #### Cross-Platform Coordination
* ```typescript
* // Share context between desktop and mobile agents
* await auiAndroid.agent.configureAsAndroidAgent();
*
* const history = await auiDesktop.act("Copy username from desktop app");
* await auiAndroid.act("Paste username into mobile login", {
* agentHistory: history
* });
* ```
*
* #### Using Images for Context
* ```typescript
* // Using image file
* await aui.act(
* "Click the 'Submit' button in the provided image",
* 'path/to/screenshot.png'
* );
*
* // Using base64 image
* const base64Image = "...";
* await aui.act(
* "Click the 'Submit' button in the provided image",
* base64Image
* );
* ```
*
* ### Best Practices
* 1. Be specific in your goal descriptions
* 2. Use chatId for related tasks to maintain context
* 3. Provide clear visual context when needed
* 4. Handle errors appropriately in your implementation
* 5. Consider using agentHistory for complex cross-platform workflows
*/
act(goal: string, options?: ActOptions): Promise<AgentHistory>;
/**
* Overload of `act` that additionally takes an image (file path or
* base64-encoded string) as visual context for the goal.
*
* @param goal - A natural language instruction describing the task to accomplish.
* @param imagePathOrBase64String - Path to an image file or base64-encoded image string.
* @param options - (Optional) Configuration options for the agent's behavior.
* @returns Promise<AgentHistory> - A promise that resolves to the updated interaction history.
*/
act(goal: string, imagePathOrBase64String: string, options?: ActOptions): Promise<AgentHistory>;
/**
* Adds tools to the agent that allow it to interact with AI elements.
*
* @returns {Promise<void>} - A promise that resolves when the tools are added to the agent.
*/
addAIElementsToolsToAgent(): Promise<void>;
/**
* Retrieves the starting arguments used when the controller server was initialized.
*
* Useful for debugging, logging, or verifying the current server configuration.
*
* NOTE(review): `displayNum` is documented as a string below, but the example
* output shows a number (`0`). The declared value type is
* `string | number | boolean` for every key, so confirm the actual type
* against the implementation.
*
* @property {string} displayNum - Display number controlled by the controller
* @property {boolean} minimize - Whether controller starts minimized
* @property {string} runtime - Runtime type ("desktop" or "android")
* @property {number} port - Communication port
* @property {number} actionWaitTime - Action wait time
* @property {string} host - Host address
* @property {string} logFile - Log file path
* @property {boolean} hideOverlay - Whether overlay is hidden
* @property {boolean} debugDraw - Whether debug drawing is enabled
* @property {string} deviceId - Android device ID
* @property {string} configFile - Configuration file path
* @property {string} logLevel - Logging level
*
* @returns A promise that resolves to a record holding the properties listed above.
*
* @example
* ```typescript
* const startingArguments = await aui.getControllerStartingArguments();
* console.log(startingArguments);
* // Output example:
* // {
* // displayNum: 0,
* // minimize: true,
* // runtime: 'desktop',
* // port: 5000,
* // actionWaitTime: 1000,
* // host: '127.0.0.1',
* // logFile: '/tmp/askui/askui-server.log',
* // hideOverlay: false,
* // debugDraw: false,
* // deviceId: 'emulator-5554',
* // configFile: '/tmp/askui/askui-config.json',
* // logLevel: 'info',
* // }
* ```
*
* @example Retrieving Android device ID:
* ```typescript
* const startingArguments = await aui.getControllerStartingArguments();
* console.log(startingArguments.deviceId);
* // Output example: "emulator-5554"
* ```
*/
getControllerStartingArguments(): Promise<Record<'displayNum' | 'minimize' | 'runtime' | 'port' | 'actionWaitTime' | 'host' | 'logFile' | 'hideOverlay' | 'debugDraw' | 'deviceId' | 'configFile' | 'logLevel', string | number | boolean>>;
}