@crawlee/playwright
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
299 lines • 18.2 kB
TypeScript
import type { BrowserCrawlerOptions, BrowserCrawlingContext, BrowserHook, BrowserRequestHandler, GetUserDataFromRequest, LoadedContext, RouterRoutes } from '@crawlee/browser';
import { BrowserCrawler, Configuration } from '@crawlee/browser';
import type { PlaywrightController, PlaywrightPlugin } from '@crawlee/browser-pool';
import type { Dictionary } from '@crawlee/types';
// @ts-ignore optional peer dependency or compatibility with es2022
import type { LaunchOptions, Page, Response } from 'playwright';
import type { PlaywrightLaunchContext } from './playwright-launcher';
import type { DirectNavigationOptions, PlaywrightContextUtils } from './utils/playwright-utils';
/**
* Crawling context handed to the handlers and hooks of a {@link PlaywrightCrawler}.
*
* It combines two things:
* - the generic `BrowserCrawlingContext` from `@crawlee/browser`, specialised with
*   {@link PlaywrightCrawler} as the crawler, the Playwright `Page` and `Response` types
*   and the `PlaywrightController` browser controller, and
* - the helpers declared by `PlaywrightContextUtils` (from `./utils/playwright-utils`).
*
* The interface adds no members of its own.
*
* @typeParam UserData - forwarded unchanged to `BrowserCrawlingContext`; defaults to `Dictionary`.
*/
export interface PlaywrightCrawlingContext<UserData extends Dictionary = Dictionary> extends BrowserCrawlingContext<PlaywrightCrawler, Page, Response, PlaywrightController, UserData>, PlaywrightContextUtils {
}
/**
* Signature of the hooks accepted by {@link PlaywrightCrawlerOptions.preNavigationHooks}
* and {@link PlaywrightCrawlerOptions.postNavigationHooks}: a `BrowserHook` specialised
* with {@link PlaywrightCrawlingContext} and {@link PlaywrightGotoOptions}.
*
* NOTE: the `@ts-ignore` directive below must stay on the line directly above the
* declaration, so this doc comment is intentionally placed before it.
*/
// @ts-ignore optional peer dependency or compatibility with es2022
export interface PlaywrightHook extends BrowserHook<PlaywrightCrawlingContext, PlaywrightGotoOptions> {
}
/**
* Type of the function supplied as {@link PlaywrightCrawlerOptions.requestHandler}
* (and the deprecated `handlePageFunction`): a `BrowserRequestHandler` that receives
* a `LoadedContext` wrapping {@link PlaywrightCrawlingContext}.
*/
export interface PlaywrightRequestHandler extends BrowserRequestHandler<LoadedContext<PlaywrightCrawlingContext>> {
}
/**
* Navigation options type used by {@link PlaywrightHook}: the type of the second
* parameter of Playwright's `Page.goto()`, intersected with `Dictionary`.
*
* NOTE(review): the `Dictionary` part presumably allows arbitrary extra keys on the
* options object — confirm against the `Dictionary` definition in `@crawlee/types`.
*/
export type PlaywrightGotoOptions = Dictionary & Parameters<Page['goto']>[1];
/**
* Options object accepted by the {@link PlaywrightCrawler} constructor.
*
* Extends `BrowserCrawlerOptions` (specialised with {@link PlaywrightCrawlingContext}
* and a single-element `browserPlugins` tuple of `PlaywrightPlugin`) and declares
* the Playwright-specific options below. All of them are optional.
*/
export interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions<PlaywrightCrawlingContext, {
browserPlugins: [PlaywrightPlugin];
}> {
/**
* The same options as used by {@link launchPlaywright}.
*/
launchContext?: PlaywrightLaunchContext;
/**
* Function that is called to process each request.
*
* The function receives the {@link PlaywrightCrawlingContext} as an argument, where:
* - `request` is an instance of the {@link Request} object with details about the URL to open, HTTP method etc.
* - `page` is an instance of the `Playwright`
*   [`Page`](https://playwright.dev/docs/api/class-page)
* - `browserController` is an instance of the
*   [`BrowserController`](https://github.com/apify/browser-pool#browsercontroller),
* - `response` is an instance of the `Playwright`
*   [`Response`](https://playwright.dev/docs/api/class-response),
*   which is the main resource response as returned by `page.goto(request.url)`.
*
* The function must return a promise, which is then awaited by the crawler.
*
* If the function throws an exception, the crawler will try to re-crawl the
* request later, up to `maxRequestRetries` times.
* If all the retries fail, the crawler calls the function
* provided to the `failedRequestHandler` parameter.
* To make this work, you should **always**
* let your function throw exceptions rather than catch them.
* The exceptions are logged to the request using the
* {@link Request.pushErrorMessage} function.
*/
requestHandler?: PlaywrightRequestHandler;
/**
* Function that is called to process each request.
*
* The function receives the {@link PlaywrightCrawlingContext} as an argument, where:
* - `request` is an instance of the {@link Request} object with details about the URL to open, HTTP method etc.
* - `page` is an instance of the `Playwright`
*   [`Page`](https://playwright.dev/docs/api/class-page)
* - `browserController` is an instance of the
*   [`BrowserController`](https://github.com/apify/browser-pool#browsercontroller),
* - `response` is an instance of the `Playwright`
*   [`Response`](https://playwright.dev/docs/api/class-response),
*   which is the main resource response as returned by `page.goto(request.url)`.
*
* The function must return a promise, which is then awaited by the crawler.
*
* If the function throws an exception, the crawler will try to re-crawl the
* request later, up to `maxRequestRetries` times.
* If all the retries fail, the crawler calls the function
* provided to the `failedRequestHandler` parameter.
* To make this work, you should **always**
* let your function throw exceptions rather than catch them.
* The exceptions are logged to the request using the
* {@link Request.pushErrorMessage} function.
*
* @deprecated `handlePageFunction` has been renamed to `requestHandler` and will be removed in a future version.
* @ignore
*/
handlePageFunction?: PlaywrightRequestHandler;
/**
* Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
* or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotoOptions`,
* which are passed to the `page.goto()` function the crawler calls to navigate.
* Example:
* ```
* preNavigationHooks: [
*     async (crawlingContext, gotoOptions) => {
*         const { page } = crawlingContext;
*         await page.evaluate((attr) => { window.foo = attr; }, 'bar');
*     },
* ]
* ```
*
* Modifying `pageOptions` is supported only in Playwright incognito.
* See {@link PrePageCreateHook}
*/
preNavigationHooks?: PlaywrightHook[];
/**
* Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
* The function accepts `crawlingContext` as the only parameter.
* Example:
* ```
* postNavigationHooks: [
*     async (crawlingContext) => {
*         const { page } = crawlingContext;
*         if (hasCaptcha(page)) {
*             await solveCaptcha(page);
*         }
*     },
* ]
* ```
*/
postNavigationHooks?: PlaywrightHook[];
}
/**
* Provides a simple framework for parallel crawling of web pages
* using headless Chromium, Firefox and Webkit browsers with [Playwright](https://github.com/microsoft/playwright).
* The URLs to crawl are fed either from a static list of URLs
* or from a dynamic queue of URLs enabling recursive crawling of websites.
*
* Since `Playwright` uses a headless browser to download web pages and extract data,
* it is useful for crawling websites that require JavaScript execution.
* If the target website doesn't need JavaScript, consider using {@link CheerioCrawler},
* which downloads the pages using raw HTTP requests and is about 10x faster.
*
* The source URLs are represented using {@link Request} objects that are fed from
* {@link RequestList} or {@link RequestQueue} instances provided by the {@link PlaywrightCrawlerOptions.requestList}
* or {@link PlaywrightCrawlerOptions.requestQueue} constructor options, respectively.
*
* If both {@link PlaywrightCrawlerOptions.requestList} and {@link PlaywrightCrawlerOptions.requestQueue} are used,
* the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
* to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
*
* The crawler finishes when there are no more {@link Request} objects to crawl.
*
* `PlaywrightCrawler` opens a new browser page (i.e. tab) for each {@link Request} object to crawl
* and then calls the function provided by user as the {@link PlaywrightCrawlerOptions.requestHandler} option.
*
* New pages are only opened when there is enough free CPU and memory available,
* using the functionality provided by the {@link AutoscaledPool} class.
* All {@link AutoscaledPool} configuration options can be passed to the {@link PlaywrightCrawlerOptions.autoscaledPoolOptions}
* parameter of the `PlaywrightCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
* {@link AutoscaledPoolOptions} are available directly in the `PlaywrightCrawler` constructor.
*
* Note that the pool of Playwright instances is internally managed by the [BrowserPool](https://github.com/apify/browser-pool) class.
*
* **Example usage:**
*
* ```javascript
* const crawler = new PlaywrightCrawler({
* async requestHandler({ page, request }) {
* // This function is called to extract data from a single web page
* // 'page' is an instance of Playwright.Page with page.goto(request.url) already called
* // 'request' is an instance of Request class with information about the page to load
* await Dataset.pushData({
* title: await page.title(),
* url: request.url,
* succeeded: true,
* })
* },
* async failedRequestHandler({ request }) {
* // This function is called when the crawling of a request failed too many times
* await Dataset.pushData({
* url: request.url,
* succeeded: false,
* errors: request.errorMessages,
* })
* },
* });
*
* await crawler.run([
* 'http://www.example.com/page-1',
* 'http://www.example.com/page-2',
* ]);
* ```
* @category Crawlers
*/
export declare class PlaywrightCrawler extends BrowserCrawler<{
browserPlugins: [PlaywrightPlugin];
}, LaunchOptions, PlaywrightCrawlingContext> {
/**
* Private member; its type is not emitted in this declaration file.
* NOTE(review): presumably holds the options passed to the constructor — confirm in the implementation.
*/
private readonly options;
/**
* The {@link Configuration} instance exposed by this crawler.
* NOTE(review): presumably the `config` constructor argument, or a default when it is omitted — confirm in the implementation.
*/
readonly config: Configuration;
/**
* Maps each recognised constructor option name to the type of the `ow` predicate
* describing its expected value (object, function, number, boolean, array, ...).
*
* Every entry is preceded by a `// @ts-ignore` directive which, per its own text,
* guards against `ow` being an optional peer dependency or ES2022 compatibility issues.
* Keep each directive on the line directly above the entry it covers.
*
* NOTE(review): presumably used to validate the options object at construction time — confirm in the implementation.
*/
protected static optionsShape: {
// @ts-ignore optional peer dependency or compatibility with es2022
browserPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
launcher: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
handlePageFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
navigationTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
preNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
postNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
launchContext: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
headless: import("ow").AnyPredicate<string | boolean>;
// @ts-ignore optional peer dependency or compatibility with es2022
sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
persistCookiesPerSession: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
ignoreShadowRoots: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
ignoreIframes: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
requestQueue: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
requestHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
handleRequestFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
requestHandlerTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
handleRequestTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
errorHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
failedRequestHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
handleFailedRequestFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
maxRequestRetries: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
sameDomainDelaySecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
statusMessageCallback: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
// @ts-ignore optional peer dependency or compatibility with es2022
onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
maxConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
// @ts-ignore optional peer dependency or compatibility with es2022
statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
};
/**
* All `PlaywrightCrawler` parameters are passed via an options object.
*
* @param options - optional {@link PlaywrightCrawlerOptions} object.
* @param config - optional {@link Configuration} instance.
*/
constructor(options?: PlaywrightCrawlerOptions, config?: Configuration);
/**
* Protected hook that takes the {@link PlaywrightCrawlingContext} of a request and
* resolves with no value.
* NOTE(review): presumably invokes the user-provided `requestHandler` for that context — confirm in the implementation.
*/
protected _runRequestHandler(context: PlaywrightCrawlingContext): Promise<void>;
/**
* Protected hook that takes the crawling context plus `DirectNavigationOptions` and
* resolves to a Playwright `Response`, or `null`.
* NOTE(review): presumably performs the `page.goto()` navigation for the request — confirm in the implementation.
*/
protected _navigationHandler(crawlingContext: PlaywrightCrawlingContext, gotoOptions: DirectNavigationOptions): Promise<Response | null>;
}
/**
* Creates new {@link Router} instance that works based on request labels.
* This instance can then serve as a `requestHandler` of your {@link PlaywrightCrawler}.
* Defaults to the {@link PlaywrightCrawlingContext}.
*
* > Serves as a shortcut for using `Router.create<PlaywrightCrawlingContext>()`.
*
* ```ts
* import { PlaywrightCrawler, createPlaywrightRouter } from 'crawlee';
*
* const router = createPlaywrightRouter();
* router.addHandler('label-a', async (ctx) => {
*     ctx.log.info('...');
* });
* router.addDefaultHandler(async (ctx) => {
*     ctx.log.info('...');
* });
*
* const crawler = new PlaywrightCrawler({
*     requestHandler: router,
* });
* await crawler.run();
* ```
*
* @typeParam Context - crawling context type the handlers receive; must extend
*   {@link PlaywrightCrawlingContext}, which is also the default.
* @typeParam UserData - defaults to the user-data type derived from `Context['request']`
*   via `GetUserDataFromRequest`.
* @param routes - optional `RouterRoutes<Context, UserData>` value.
*   NOTE(review): presumably an initial set of label-to-handler routes registered on the
*   new router — confirm against the `RouterRoutes` definition in `@crawlee/browser`.
* @returns a `RouterHandler<Context>` (type imported from `@crawlee/browser`).
*/
// @ts-ignore optional peer dependency or compatibility with es2022
export declare function createPlaywrightRouter<Context extends PlaywrightCrawlingContext = PlaywrightCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): import("@crawlee/browser").RouterHandler<Context>;
//# sourceMappingURL=playwright-crawler.d.ts.map