UNPKG

@crawlee/playwright

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

299 lines • 18.2 kB
import type { BrowserCrawlerOptions, BrowserCrawlingContext, BrowserHook, BrowserRequestHandler, GetUserDataFromRequest, LoadedContext, RouterRoutes } from '@crawlee/browser';
import { BrowserCrawler, Configuration } from '@crawlee/browser';
import type { PlaywrightController, PlaywrightPlugin } from '@crawlee/browser-pool';
import type { Dictionary } from '@crawlee/types';
// @ts-ignore optional peer dependency or compatibility with es2022
import type { LaunchOptions, Page, Response } from 'playwright';
import type { PlaywrightLaunchContext } from './playwright-launcher';
import type { DirectNavigationOptions, PlaywrightContextUtils } from './utils/playwright-utils';
/**
 * Crawling context handed to {@link PlaywrightCrawler} handlers and hooks.
 *
 * It is the generic {@link BrowserCrawlingContext} specialised with the Playwright
 * `Page`, `Response` and {@link PlaywrightController} types, extended with the
 * helpers declared by {@link PlaywrightContextUtils}.
 */
export interface PlaywrightCrawlingContext<UserData extends Dictionary = Dictionary> extends BrowserCrawlingContext<PlaywrightCrawler, Page, Response, PlaywrightController, UserData>, PlaywrightContextUtils {
}
/**
 * Signature of the functions accepted by {@link PlaywrightCrawlerOptions.preNavigationHooks}
 * and {@link PlaywrightCrawlerOptions.postNavigationHooks}.
 */
// @ts-ignore optional peer dependency or compatibility with es2022
export interface PlaywrightHook extends BrowserHook<PlaywrightCrawlingContext, PlaywrightGotoOptions> {
}
/**
 * Signature of the function accepted by {@link PlaywrightCrawlerOptions.requestHandler}.
 */
export interface PlaywrightRequestHandler extends BrowserRequestHandler<LoadedContext<PlaywrightCrawlingContext>> {
}
/**
 * Navigation options: the type of the second parameter of Playwright's `page.goto()`,
 * intersected with {@link Dictionary}.
 */
export type PlaywrightGotoOptions = Dictionary & Parameters<Page['goto']>[1];
/**
 * Options accepted by the {@link PlaywrightCrawler} constructor, in addition to
 * those inherited from {@link BrowserCrawlerOptions}.
 */
export interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions<PlaywrightCrawlingContext, {
    browserPlugins: [PlaywrightPlugin];
}> {
    /**
     * The same options as used by {@link launchPlaywright}.
     */
    launchContext?: PlaywrightLaunchContext;
    /**
     * Function that is called to process each request.
     *
     * The function receives the {@link PlaywrightCrawlingContext} as an argument, where:
     * - `request` is an instance of the {@link Request} object with details about the URL to open, HTTP method etc.
     * - `page` is an instance of the `Playwright`
     * [`Page`](https://playwright.dev/docs/api/class-page)
     * - `browserController` is an instance of the
     * [`BrowserController`](https://github.com/apify/browser-pool#browsercontroller),
     * - `response` is an instance of the `Playwright`
     * [`Response`](https://playwright.dev/docs/api/class-response),
     * which is the main resource response as returned by `page.goto(request.url)`.
     *
     * The function must return a promise, which is then awaited by the crawler.
     *
     * If the function throws an exception, the crawler will try to re-crawl the
     * request later, up to `maxRequestRetries` times.
     * If all the retries fail, the crawler calls the function
     * provided to the `failedRequestHandler` parameter.
     * To make this work, you should **always**
     * let your function throw exceptions rather than catch them.
     * The exceptions are logged to the request using the
     * {@link Request.pushErrorMessage} function.
     */
    requestHandler?: PlaywrightRequestHandler;
    /**
     * Function that is called to process each request.
     *
     * The function receives the {@link PlaywrightCrawlingContext} as an argument, where:
     * - `request` is an instance of the {@link Request} object with details about the URL to open, HTTP method etc.
     * - `page` is an instance of the `Playwright`
     * [`Page`](https://playwright.dev/docs/api/class-page)
     * - `browserController` is an instance of the
     * [`BrowserController`](https://github.com/apify/browser-pool#browsercontroller),
     * - `response` is an instance of the `Playwright`
     * [`Response`](https://playwright.dev/docs/api/class-response),
     * which is the main resource response as returned by `page.goto(request.url)`.
     *
     * The function must return a promise, which is then awaited by the crawler.
     *
     * If the function throws an exception, the crawler will try to re-crawl the
     * request later, up to `maxRequestRetries` times.
     * If all the retries fail, the crawler calls the function
     * provided to the `failedRequestHandler` parameter.
     * To make this work, you should **always**
     * let your function throw exceptions rather than catch them.
     * The exceptions are logged to the request using the
     * {@link Request.pushErrorMessage} function.
     *
     * @deprecated `handlePageFunction` has been renamed to `requestHandler` and will be removed in a future version.
     * @ignore
     */
    handlePageFunction?: PlaywrightRequestHandler;
    /**
     * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
     * or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotoOptions`,
     * which are passed to the `page.goto()` function the crawler calls to navigate.
     * Example:
     * ```
     * preNavigationHooks: [
     *     async (crawlingContext, gotoOptions) => {
     *         const { page } = crawlingContext;
     *         await page.evaluate((attr) => { window.foo = attr; }, 'bar');
     *     },
     * ]
     * ```
     *
     * Modifying `pageOptions` is supported only in Playwright incognito.
     * See {@link PrePageCreateHook}
     */
    preNavigationHooks?: PlaywrightHook[];
    /**
     * Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
     * The function accepts `crawlingContext` as the only parameter.
     * Example:
     * ```
     * postNavigationHooks: [
     *     async (crawlingContext) => {
     *         const { page } = crawlingContext;
     *         if (hasCaptcha(page)) {
     *             await solveCaptcha(page);
     *         }
     *     },
     * ]
     * ```
     */
    postNavigationHooks?: PlaywrightHook[];
}
/**
 * Provides a simple framework for parallel crawling of web pages
 * using headless Chromium, Firefox and Webkit browsers with [Playwright](https://github.com/microsoft/playwright).
 * The URLs to crawl are fed either from a static list of URLs
 * or from a dynamic queue of URLs enabling recursive crawling of websites.
 *
 * Since `Playwright` uses a headless browser to download web pages and extract data,
 * it is useful for crawling websites that require JavaScript execution.
 * If the target website doesn't need JavaScript, consider using {@link CheerioCrawler},
 * which downloads the pages using raw HTTP requests and is about 10x faster.
 *
 * The source URLs are represented using {@link Request} objects that are fed from
 * {@link RequestList} or {@link RequestQueue} instances provided by the {@link PlaywrightCrawlerOptions.requestList}
 * or {@link PlaywrightCrawlerOptions.requestQueue} constructor options, respectively.
 *
 * If both {@link PlaywrightCrawlerOptions.requestList} and {@link PlaywrightCrawlerOptions.requestQueue} are used,
 * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
 * to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
 *
 * The crawler finishes when there are no more {@link Request} objects to crawl.
 *
 * `PlaywrightCrawler` opens a new browser page (i.e. tab) for each {@link Request} object to crawl
 * and then calls the function provided by the user as the {@link PlaywrightCrawlerOptions.requestHandler} option.
 *
 * New pages are only opened when there is enough free CPU and memory available,
 * using the functionality provided by the {@link AutoscaledPool} class.
 * All {@link AutoscaledPool} configuration options can be passed to the {@link PlaywrightCrawlerOptions.autoscaledPoolOptions}
 * parameter of the `PlaywrightCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
 * {@link AutoscaledPoolOptions} are available directly in the `PlaywrightCrawler` constructor.
 *
 * Note that the pool of Playwright instances is internally managed by the [BrowserPool](https://github.com/apify/browser-pool) class.
 *
 * **Example usage:**
 *
 * ```javascript
 * const crawler = new PlaywrightCrawler({
 *     async requestHandler({ page, request }) {
 *         // This function is called to extract data from a single web page
 *         // 'page' is an instance of Playwright.Page with page.goto(request.url) already called
 *         // 'request' is an instance of Request class with information about the page to load
 *         await Dataset.pushData({
 *             title: await page.title(),
 *             url: request.url,
 *             succeeded: true,
 *         });
 *     },
 *     async failedRequestHandler({ request }) {
 *         // This function is called when the crawling of a request failed too many times
 *         await Dataset.pushData({
 *             url: request.url,
 *             succeeded: false,
 *             errors: request.errorMessages,
 *         });
 *     },
 * });
 *
 * await crawler.run([
 *     'http://www.example.com/page-1',
 *     'http://www.example.com/page-2',
 * ]);
 * ```
 * @category Crawlers
 */
export declare class PlaywrightCrawler extends BrowserCrawler<{
    browserPlugins: [PlaywrightPlugin];
}, LaunchOptions, PlaywrightCrawlingContext> {
    private readonly options;
    /** The {@link Configuration} instance associated with this crawler. */
    readonly config: Configuration;
    /**
     * `ow` predicates, one entry per recognised constructor option.
     * NOTE(review): presumably used to validate the options object at construction time;
     * the implementation is not visible in this declaration file.
     */
    protected static optionsShape: {
        // @ts-ignore optional peer dependency or compatibility with es2022
        browserPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        launcher: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handlePageFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        navigationTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        preNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        postNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        launchContext: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        headless: import("ow").AnyPredicate<string | boolean>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        persistCookiesPerSession: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        ignoreShadowRoots: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        ignoreIframes: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestQueue: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handleRequestFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestHandlerTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handleRequestTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        errorHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        failedRequestHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handleFailedRequestFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestRetries: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        sameDomainDelaySecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        statusMessageCallback: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
    };
    /**
     * All `PlaywrightCrawler` parameters are passed via an options object.
     */
    constructor(options?: PlaywrightCrawlerOptions, config?: Configuration);
    /**
     * Internal hook that runs the request handler for one crawling context.
     * Resolves with no value.
     */
    protected _runRequestHandler(context: PlaywrightCrawlingContext): Promise<void>;
    /**
     * Internal hook that performs the navigation for one crawling context.
     * Resolves with the Playwright `Response`, or `null`.
     * NOTE(review): presumably delegates to `page.goto()` with `gotoOptions` — confirm against the implementation.
     */
    protected _navigationHandler(crawlingContext: PlaywrightCrawlingContext, gotoOptions: DirectNavigationOptions): Promise<Response | null>;
}
/**
 * Creates new {@link Router} instance that works based on request labels.
 * This instance can then serve as a `requestHandler` of your {@link PlaywrightCrawler}.
 * Defaults to the {@link PlaywrightCrawlingContext}.
 *
 * > Serves as a shortcut for using `Router.create<PlaywrightCrawlingContext>()`.
 *
 * ```ts
 * import { PlaywrightCrawler, createPlaywrightRouter } from 'crawlee';
 *
 * const router = createPlaywrightRouter();
 * router.addHandler('label-a', async (ctx) => {
 *    ctx.log.info('...');
 * });
 * router.addDefaultHandler(async (ctx) => {
 *    ctx.log.info('...');
 * });
 *
 * const crawler = new PlaywrightCrawler({
 *     requestHandler: router,
 * });
 * await crawler.run();
 * ```
 */
// @ts-ignore optional peer dependency or compatibility with es2022
export declare function createPlaywrightRouter<Context extends PlaywrightCrawlingContext = PlaywrightCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): import("@crawlee/browser").RouterHandler<Context>;
//# sourceMappingURL=playwright-crawler.d.ts.map