@skypilot/scraper
Version:
Node-based scriptable web scraper
150 lines (149 loc) • 4.71 kB
TypeScript
import type { Integer } from '@skypilot/common-types';
import type { ElementHandle } from 'playwright';
import type { State } from 'src/scraper/State';
import { SliceRange } from 'src/lib/array/sliceArray';
import type { Dict, Href } from './pseudotypes';
/**
 * Browser-related settings. Every field is optional.
 *
 * NOTE(review): the per-field descriptions below are inferred from the field
 * names only; confirm them against the code that consumes this interface.
 */
export interface BrowserOptions {
/** Integer number of milliseconds; presumably a fixed delay applied by the scraper (TODO confirm where). */
delayInMs?: Integer;
/** Presumably selects headless browser mode (TODO confirm, including the default). */
headless?: boolean;
/** Presumably the User-Agent string the browser should report (TODO confirm). */
userAgent?: string;
/** Integer number of milliseconds; presumably a variable amount added on top of `delayInMs` (TODO confirm). */
variableDelayInMs?: Integer;
}
/**
 * Options accepted by a `click` command (see `ClickCommand.options`).
 *
 * NOTE(review): the documented fields link to the Playwright docs and look
 * like Playwright's own click options; presumably they are passed through
 * unchanged, but confirm that in the implementation.
 */
export interface ClickOptions {
/** Mouse button to use. */
button?: 'left' | 'right' | 'middle';
/**
* Defaults to 1. See [UIEvent.detail].
*/
clickCount?: number;
/**
* Time to wait between `mousedown` and `mouseup` in milliseconds. Defaults to 0.
*/
delay?: number;
/**
* Whether to bypass the [actionability](https://playwright.dev/docs/actionability) checks. Defaults to `false`.
*/
force?: boolean;
/**
* Modifier keys to press. Ensures that only these modifiers are pressed during the operation, and then restores
* current modifiers back. If not specified, currently pressed modifiers are used.
*/
modifiers?: Array<'Alt' | 'Control' | 'Meta' | 'Shift'>;
/**
* Actions that initiate navigations are waiting for these navigations to happen and for pages to start loading. You
* can opt out of waiting via setting this flag. You would only need this option in the exceptional cases such as
* navigating to inaccessible pages. Defaults to `false`.
*/
noWaitAfter?: boolean;
/**
* A point to use relative to the top-left corner of element padding box. If not specified, uses some visible point
* of the element.
*/
position?: {
x: number;
y: number;
};
/**
* Maximum time in milliseconds, defaults to 30 seconds, pass `0` to disable timeout. The default value can be changed
* by using the
* [browserContext.setDefaultTimeout(timeout)](https://playwright.dev/docs/api/class-browsercontext#browsercontextsetdefaulttimeouttimeout)
* or [page.setDefaultTimeout(timeout)](https://playwright.dev/docs/api/class-page#pagesetdefaulttimeouttimeout) methods.
*/
timeout?: number;
/** NOTE(review): presumably controls whether a wait timeout raises an error instead of being tolerated; confirm. */
throwOnWaitTimeout?: boolean;
/** Integer number of milliseconds. NOTE(review): presumably the wait timeout that `throwOnWaitTimeout` refers to; confirm how it relates to `timeout`. */
waitTimeoutInMs?: Integer;
}
/**
 * Dictionary whose values are either a plain string or a full `Query` object.
 *
 * Keys are plain `string`s. The former key type `Exclude<string, ''>` resolved
 * to exactly `string` (`Exclude` only removes members of a union, and `string`
 * is not a union), so it never rejected empty keys at compile time. The key
 * type is now spelled as what it actually is; assignability is unchanged.
 *
 * NOTE(review): if empty keys must really be rejected, that has to be a
 * runtime check. Presumably a string value is shorthand for a selector; confirm.
 */
export declare type FlexQueryDict = Record<string, string | Query>;
/**
 * Options accepted by the navigation commands (`FollowCommand.options` and
 * `GoToCommand.options`). Every field is optional.
 *
 * NOTE(review): field descriptions marked "presumably" are inferred from the
 * names only; confirm against the implementation.
 */
export interface NavOptions {
/** Presumably records the visited URL in the scraper state (TODO confirm). */
addUrlToState?: boolean;
/** Presumably opens the target in a new page rather than reusing the current one (TODO confirm). */
newPage?: boolean;
/** Integer; presumably the maximum number of navigation retries (TODO confirm). */
retryLimit?: Integer;
/** State object to use for this navigation. */
state?: State;
/** One of 'load', 'domcontentloaded' or 'networkidle'; presumably the page lifecycle event to wait for (TODO confirm). */
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle';
/**
* Callback that receives a URL string and returns either an object with an
* optional `messages` string array, or `null`/`undefined`.
* NOTE(review): which return value means "valid" is not visible here; confirm.
*/
validate?: (url: string) => {
messages?: string[];
} | null | undefined;
/** Presumably enables extra logging (TODO confirm). */
verbose?: boolean;
}
/**
 * Dictionary of `Query` objects keyed by string (used by `QueryCommand.queryDict`).
 *
 * Keys are plain `string`s. The former key type `Exclude<string, ''>` resolved
 * to exactly `string` (`Exclude` only removes members of a union, and `string`
 * is not a union), so it never rejected empty keys at compile time. The key
 * type is now spelled as what it actually is; assignability is unchanged.
 *
 * NOTE(review): if empty keys must really be rejected, that has to be a
 * runtime check.
 */
export declare type QueryDict = Record<string, Query>;
/**
 * Options accepted by a `query` command (see `QueryCommand.options`).
 * Every field is optional.
 */
export interface QueryOptions {
/** Playwright element handle. NOTE(review): presumably the element that selectors are resolved relative to; confirm. */
baseRef?: ElementHandle;
/** State object to use for this query. */
state?: State;
/** NOTE(review): presumably a path inside the state where results are stored; confirm the path syntax. */
statePath?: string;
/** Callback that receives the query results and the scraper context and returns a value of any type. */
transform?: (queryResults: any, scraperContext: ScraperContext) => any;
/** NOTE(review): presumably controls whether results are written into the state; confirm the default. */
updateState?: boolean;
}
/**
 * Result shape for a dictionary of queries: for every key of `Q`, the value is
 * either a single string or an array of strings.
 *
 * `Q` defaults to `Record<string, unknown>`, which yields a string-keyed result.
 */
export declare type QueryResult<Q extends Record<string, unknown> = Record<string, unknown>> = Record<keyof Q, string | string[]>;
/**
 * Options for running scraper commands. Every field is optional.
 *
 * NOTE(review): nothing in this file shows where these options are consumed;
 * descriptions marked "presumably" are inferred from the names and must be
 * confirmed against the implementation.
 */
export interface RunOptions {
/** Playwright element handle. Presumably the element that selectors are resolved relative to (TODO confirm). */
baseRef?: ElementHandle;
/** Presumably the name of the collection that results are written to (TODO confirm). */
collectionName?: string;
/** Presumably toggles printing of the run log (TODO confirm). */
displayLog?: boolean;
/** Integer. Presumably an index identifying the current element or iteration (TODO confirm, including whether it is 0- or 1-based). */
nth?: Integer;
/** Integer. Presumably the maximum number of retries (TODO confirm). */
retryLimit?: Integer;
/** State object to use for the run. */
state?: State;
/** Presumably a path inside the state (TODO confirm the path syntax). */
statePath?: string;
/** Presumably enables extra logging (TODO confirm). */
verbose?: boolean;
}
/**
 * Options accepted by a `runOnAll` command (see `RunOnAllCommand.options`).
 * Every field is optional.
 */
export interface RunOnOptions {
/** Playwright element handle. NOTE(review): presumably the element that selectors are resolved relative to; confirm. */
baseRef?: ElementHandle;
/** NOTE(review): presumably the name of the collection that results are written to; confirm. */
collectionName?: string;
/** State object to use. */
state?: State;
}
/**
 * Command variant tagged with `action: 'click'`.
 * Carries the `Query` for the command and optional `ClickOptions`.
 */
export declare type ClickCommand = {
action: 'click';
query: Query;
options?: ClickOptions;
};
/**
 * Command variant tagged with `action: 'follow'`.
 * Carries a `Query` and optional `NavOptions`.
 * NOTE(review): presumably navigates to the link found by `query`; confirm.
 */
export declare type FollowCommand = {
action: 'follow';
query: Query;
options?: NavOptions;
};
/**
 * Command variant tagged with `action: 'goTo'`.
 * Carries the target `url` (an `Href`) and optional `NavOptions`.
 */
export declare type GoToCommand = {
action: 'goTo';
url: Href;
options?: NavOptions;
};
/**
 * Command variant tagged with `action: 'query'`.
 * Carries a `QueryDict` (string-keyed `Query` objects) and optional `QueryOptions`.
 */
export declare type QueryCommand = {
action: 'query';
queryDict: QueryDict;
options?: QueryOptions;
};
/**
 * Command variant tagged with `action: 'runOnAll'`.
 * Carries a `Query`, a nested list of `ScraperCommand`s, and optional `RunOnOptions`.
 * NOTE(review): presumably runs `commands` once for every element matched by
 * `query`; confirm.
 */
export declare type RunOnAllCommand = {
action: 'runOnAll';
query: Query;
commands: ScraperCommand[];
options?: RunOnOptions;
};
/**
 * Command variant tagged with `action: 'set'`.
 * Carries a `state` dictionary (`Dict`).
 * NOTE(review): presumably these values are written into the scraper state;
 * confirm whether they are merged or replace the state.
 */
export declare type SetCommand = {
action: 'set';
state: Dict;
};
/**
 * Command variant tagged with `action: 'write'`.
 * Optionally names a collection via `collectionName`.
 * NOTE(review): what is written, and where, is not visible in this file; confirm.
 */
export declare type WriteCommand = {
action: 'write';
/** NOTE(review): presumably the destination collection; confirm the behaviour when omitted. */
collectionName?: string;
};
/**
 * Any command the scraper accepts. Each member carries a distinct string
 * literal in its `action` field, so `action` can be used to narrow a
 * `ScraperCommand` to its concrete variant.
 */
export declare type ScraperCommand =
  | ClickCommand
  | FollowCommand
  | GoToCommand
  | QueryCommand
  | RunOnAllCommand
  | SetCommand
  | WriteCommand;
/**
 * Context object handed to `transform` callbacks (see `Query.transform` and
 * `QueryOptions.transform`).
 */
export interface ScraperContext {
/** Optional browser information. */
browser?: {
/** Browser version string, when available. */
version?: string;
};
/** Page information; the object is required, its fields are optional. */
page: {
/** Page title, when available. */
title?: string;
/** Page URL, when available. */
url?: string;
};
}
/**
 * Description of a single query. Only `sel` is required.
 *
 * NOTE(review): descriptions marked "presumably" are inferred from the field
 * names only; confirm them against the query implementation.
 */
export interface Query {
/** Playwright element handle. Presumably the element that `sel` is resolved relative to (TODO confirm). */
baseRef?: ElementHandle;
/** Presumably the name of an attribute to read from matched elements (TODO confirm what is read when omitted). */
attr?: string;
/** Integer. Presumably picks one match by index (TODO confirm, including whether it is 0- or 1-based). */
nth?: Integer;
/** Selector string. */
sel: string;
/** Integer. Presumably caps the number of matches returned (TODO confirm). */
limit?: Integer;
/** Either 'one' or 'all'. Presumably chooses between a single match and every match (TODO confirm the default). */
scope?: 'one' | 'all';
/** A `SliceRange`. Presumably selects a sub-range of the matches (TODO confirm). */
slice?: SliceRange;
/** Callback that receives an input value and the scraper context and returns a value of any type. */
transform?: (input: any, scraperContext: ScraperContext) => any;
/** Presumably disables whitespace trimming of extracted text (TODO confirm). */
noTrim?: boolean;
}
/**
 * The pair of state-related options: a `State` object and a path string.
 * Both fields are optional.
 */
export interface StateOptions {
/** State object to use. */
state?: State;
/** NOTE(review): presumably a path inside the state; confirm the path syntax. */
statePath?: string;
}