@crawlee/browser

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
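For orientation, here is a minimal usage sketch. It assumes the `crawlee` meta-package, which re-exports `PuppeteerCrawler`, a concrete subclass of the abstract `BrowserCrawler` declared in this file; the target URL is a placeholder.

import { PuppeteerCrawler } from 'crawlee';

const crawler = new PuppeteerCrawler({
    // `page` is a Puppeteer Page, `request` describes the URL being processed.
    async requestHandler({ request, page, enqueueLinks, log }) {
        log.info(`${request.url}: ${await page.title()}`);
        await enqueueLinks(); // follow links found on the page
    },
});

await crawler.run(['https://example.com']);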

browser-crawler.d.ts
import type { Awaitable, BasicCrawlerOptions, BasicCrawlingContext, CrawlingContext, Dictionary, EnqueueLinksOptions, ErrorHandler, LoadedContext, ProxyConfiguration, RequestHandler, RequestProvider, SkippedRequestCallback } from '@crawlee/basic';
import { BasicCrawler, Configuration } from '@crawlee/basic';
import type { BrowserController, BrowserPlugin, BrowserPoolHooks, BrowserPoolOptions, CommonPage, InferBrowserPluginArray, LaunchContext } from '@crawlee/browser-pool';
import { BrowserPool } from '@crawlee/browser-pool';
import type { RobotsTxtFile } from '@crawlee/utils';
import type { ReadonlyDeep } from 'type-fest';
import type { BrowserLaunchContext } from './browser-launcher';
export interface BrowserCrawlingContext<Crawler = unknown, Page extends CommonPage = CommonPage, Response = Dictionary, ProvidedController = BrowserController, UserData extends Dictionary = Dictionary> extends CrawlingContext<Crawler, UserData> {
    browserController: ProvidedController;
    page: Page;
    response?: Response;
}
export type BrowserRequestHandler<Context extends BrowserCrawlingContext = BrowserCrawlingContext> = RequestHandler<Context>;
export type BrowserErrorHandler<Context extends BrowserCrawlingContext = BrowserCrawlingContext> = ErrorHandler<Context>;
export type BrowserHook<Context = BrowserCrawlingContext, GoToOptions extends Dictionary | undefined = Dictionary> = (crawlingContext: Context, gotoOptions: GoToOptions) => Awaitable<void>;
export interface BrowserCrawlerOptions<Context extends BrowserCrawlingContext = BrowserCrawlingContext, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, __BrowserPlugins extends BrowserPlugin[] = InferBrowserPluginArray<InternalBrowserPoolOptions['browserPlugins']>, __BrowserControllerReturn extends BrowserController = ReturnType<__BrowserPlugins[number]['createController']>, __LaunchContextReturn extends LaunchContext = ReturnType<__BrowserPlugins[number]['createLaunchContext']>> extends Omit<BasicCrawlerOptions, 'requestHandler' | 'handleRequestFunction' | 'failedRequestHandler' | 'handleFailedRequestFunction' | 'errorHandler'> {
    launchContext?: BrowserLaunchContext<any, any>;
    /**
     * Function that is called to process each request.
     *
     * The function receives the {@link BrowserCrawlingContext}
     * (actual context will be enhanced with the crawler specific properties) as an argument, where:
     * - {@link BrowserCrawlingContext.request|`request`} is an instance of the {@link Request} object
     * with details about the URL to open, HTTP method etc;
     * - {@link BrowserCrawlingContext.page|`page`} is an instance of the
     * Puppeteer [Page](https://pptr.dev/api/puppeteer.page) or
     * Playwright [Page](https://playwright.dev/docs/api/class-page);
     * - {@link BrowserCrawlingContext.browserController|`browserController`} is an instance of the {@link BrowserController};
     * - {@link BrowserCrawlingContext.response|`response`} is an instance of the
     * Puppeteer [Response](https://pptr.dev/api/puppeteer.httpresponse) or
     * Playwright [Response](https://playwright.dev/docs/api/class-response),
     * which is the main resource response as returned by the respective `page.goto()` function.
     *
     * The function must return a promise, which is then awaited by the crawler.
     *
     * If the function throws an exception, the crawler will try to re-crawl the
     * request later, up to the {@link BrowserCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
     * If all the retries fail, the crawler calls the function
     * provided to the {@link BrowserCrawlerOptions.failedRequestHandler|`failedRequestHandler`} parameter.
     * To make this work, we should **always**
     * let our function throw exceptions rather than catch them.
     * The exceptions are logged to the request using the
     * {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
     */
    requestHandler?: BrowserRequestHandler<LoadedContext<Context>>;
    /**
     * Function that is called to process each request.
     *
     * The function receives the {@link BrowserCrawlingContext}
     * (actual context will be enhanced with the crawler specific properties) as an argument, where:
     * - {@link BrowserCrawlingContext.request|`request`} is an instance of the {@link Request} object
     * with details about the URL to open, HTTP method etc;
     * - {@link BrowserCrawlingContext.page|`page`} is an instance of the
     * Puppeteer [Page](https://pptr.dev/api/puppeteer.page) or
     * Playwright [Page](https://playwright.dev/docs/api/class-page);
     * - {@link BrowserCrawlingContext.browserController|`browserController`} is an instance of the {@link BrowserController};
     * - {@link BrowserCrawlingContext.response|`response`} is an instance of the
     * Puppeteer [Response](https://pptr.dev/api/puppeteer.httpresponse) or
     * Playwright [Response](https://playwright.dev/docs/api/class-response),
     * which is the main resource response as returned by the respective `page.goto()` function.
     *
     * The function must return a promise, which is then awaited by the crawler.
     *
     * If the function throws an exception, the crawler will try to re-crawl the
     * request later, up to the {@link BrowserCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
     * If all the retries fail, the crawler calls the function
     * provided to the {@link BrowserCrawlerOptions.failedRequestHandler|`failedRequestHandler`} parameter.
     * To make this work, we should **always**
     * let our function throw exceptions rather than catch them.
     * The exceptions are logged to the request using the
     * {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
     *
     * @deprecated `handlePageFunction` has been renamed to `requestHandler` and will be removed in a future version.
     * @ignore
     */
    handlePageFunction?: BrowserRequestHandler<LoadedContext<Context>>;
    /**
     * User-provided function that allows modifying the request object before it gets retried by the crawler.
     * It's executed before each retry for the requests that failed less than {@link BrowserCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
     *
     * The function receives the {@link BrowserCrawlingContext}
     * (actual context will be enhanced with the crawler specific properties) as the first argument,
     * where the {@link BrowserCrawlingContext.request|`request`} corresponds to the request to be retried.
     * Second argument is the `Error` instance that
     * represents the last error thrown during processing of the request.
     */
    errorHandler?: BrowserErrorHandler<Context>;
    /**
     * A function to handle requests that failed more than `option.maxRequestRetries` times.
     *
     * The function receives the {@link BrowserCrawlingContext}
     * (actual context will be enhanced with the crawler specific properties) as the first argument,
     * where the {@link BrowserCrawlingContext.request|`request`} corresponds to the failed request.
     * Second argument is the `Error` instance that
     * represents the last error thrown during processing of the request.
     */
    failedRequestHandler?: BrowserErrorHandler<Context>;
    /**
     * A function to handle requests that failed more than `option.maxRequestRetries` times.
     *
     * The function receives the {@link BrowserCrawlingContext}
     * (actual context will be enhanced with the crawler specific properties) as the first argument,
     * where the {@link BrowserCrawlingContext.request|`request`} corresponds to the failed request.
     * Second argument is the `Error` instance that
     * represents the last error thrown during processing of the request.
     *
     * @deprecated `handleFailedRequestFunction` has been renamed to `failedRequestHandler` and will be removed in a future version.
     * @ignore
     */
    handleFailedRequestFunction?: BrowserErrorHandler<Context>;
    /**
     * Custom options passed to the underlying {@link BrowserPool} constructor.
     * We can tweak those to fine-tune browser management.
     */
    browserPoolOptions?: Partial<BrowserPoolOptions> & Partial<BrowserPoolHooks<__BrowserControllerReturn, __LaunchContextReturn>>;
    /**
     * If set, the crawler will be configured for all connections to use
     * the Proxy URLs provided and rotated according to the configuration.
     */
    proxyConfiguration?: ProxyConfiguration;
    /**
     * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
     * or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotoOptions`,
     * which are passed to the `page.goto()` function the crawler calls to navigate.
     *
     * **Example:**
     *
     * ```js
     * preNavigationHooks: [
     *     async (crawlingContext, gotoOptions) => {
     *         const { page } = crawlingContext;
     *         await page.evaluate((attr) => { window.foo = attr; }, 'bar');
     *         gotoOptions.timeout = 60_000;
     *         gotoOptions.waitUntil = 'domcontentloaded';
     *     },
     * ]
     * ```
     *
     * Modifying `pageOptions` is supported only in Playwright incognito.
     * See {@link PrePageCreateHook}
     */
    preNavigationHooks?: BrowserHook<Context>[];
    /**
     * Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
     * The function accepts `crawlingContext` as the only parameter.
     *
     * **Example:**
     *
     * ```js
     * postNavigationHooks: [
     *     async (crawlingContext) => {
     *         const { page } = crawlingContext;
     *         if (hasCaptcha(page)) {
     *             await solveCaptcha(page);
     *         }
     *     },
     * ]
     * ```
     */
    postNavigationHooks?: BrowserHook<Context>[];
    /**
     * Timeout in which page navigation needs to finish, in seconds.
     */
    navigationTimeoutSecs?: number;
    /**
     * Defines whether the cookies should be persisted for sessions.
     * This can only be used when `useSessionPool` is set to `true`.
     */
    persistCookiesPerSession?: boolean;
    /**
     * Whether to run browser in headless mode. Defaults to `true`.
     * Can be also set via {@link Configuration}.
     */
    headless?: boolean | 'new' | 'old';
    /**
     * Whether to ignore custom elements (and their #shadow-roots) when processing the page content via `parseWithCheerio` helper.
     * By default, they are expanded automatically. Use this option to disable this behavior.
     */
    ignoreShadowRoots?: boolean;
    /**
     * Whether to ignore `iframes` when processing the page content via `parseWithCheerio` helper.
     * By default, `iframes` are expanded automatically. Use this option to disable this behavior.
     */
    ignoreIframes?: boolean;
}
/**
 * Provides a simple framework for parallel crawling of web pages
 * using headless browsers with [Puppeteer](https://github.com/puppeteer/puppeteer)
 * and [Playwright](https://github.com/microsoft/playwright).
 * The URLs to crawl are fed either from a static list of URLs
 * or from a dynamic queue of URLs enabling recursive crawling of websites.
 *
 * Since `BrowserCrawler` uses headless (or even headful) browsers to download web pages and extract data,
 * it is useful for crawling websites that require JavaScript execution.
 * If the target website doesn't need JavaScript, we should consider using the {@link CheerioCrawler},
 * which downloads the pages using raw HTTP requests and is about 10x faster.
 *
 * The source URLs are represented by the {@link Request} objects that are fed from the {@link RequestList} or {@link RequestQueue} instances
 * provided by the {@link BrowserCrawlerOptions.requestList|`requestList`} or {@link BrowserCrawlerOptions.requestQueue|`requestQueue`}
 * constructor options, respectively. If neither `requestList` nor `requestQueue` options are provided,
 * the crawler will open the default request queue either when the {@link BrowserCrawler.addRequests|`crawler.addRequests()`} function is called,
 * or if `requests` parameter (representing the initial requests) of the {@link BrowserCrawler.run|`crawler.run()`} function is provided.
 *
 * If both {@link BrowserCrawlerOptions.requestList|`requestList`} and {@link BrowserCrawlerOptions.requestQueue|`requestQueue`} options are used,
 * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
 * to the {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
 *
 * The crawler finishes when there are no more {@link Request} objects to crawl.
 *
 * `BrowserCrawler` opens a new browser page (i.e. tab or window) for each {@link Request} object to crawl
 * and then calls the function provided by the user as the {@link BrowserCrawlerOptions.requestHandler|`requestHandler`} option.
 *
 * New pages are only opened when there is enough free CPU and memory available,
 * using the functionality provided by the {@link AutoscaledPool} class.
 * All {@link AutoscaledPool} configuration options can be passed to the {@link BrowserCrawlerOptions.autoscaledPoolOptions|`autoscaledPoolOptions`}
 * parameter of the `BrowserCrawler` constructor.
 * For user convenience, the {@link AutoscaledPoolOptions.minConcurrency|`minConcurrency`} and
 * {@link AutoscaledPoolOptions.maxConcurrency|`maxConcurrency`} options of the
 * underlying {@link AutoscaledPool} constructor are available directly in the `BrowserCrawler` constructor.
 *
 * > *NOTE:* the pool of browser instances is internally managed by the {@link BrowserPool} class.
 *
 * @category Crawlers
 */
export declare abstract class BrowserCrawler<InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, LaunchOptions extends Dictionary | undefined = Dictionary, Context extends BrowserCrawlingContext = BrowserCrawlingContext, GoToOptions extends Dictionary = Dictionary> extends BasicCrawler<Context> {
    readonly config: Configuration;
    /**
     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
     * Only available if used by the crawler.
     */
    proxyConfiguration?: ProxyConfiguration;
    /**
     * A reference to the underlying {@link BrowserPool} class that manages the crawler's browsers.
     */
    browserPool: BrowserPool<InternalBrowserPoolOptions>;
    launchContext: BrowserLaunchContext<LaunchOptions, unknown>;
    protected userProvidedRequestHandler: BrowserRequestHandler<Context>;
    protected navigationTimeoutMillis: number;
    protected requestHandlerTimeoutInnerMillis: number;
    protected preNavigationHooks: BrowserHook<Context>[];
    protected postNavigationHooks: BrowserHook<Context>[];
    protected persistCookiesPerSession: boolean;
    protected static optionsShape: {
        // @ts-ignore optional peer dependency or compatibility with es2022
        handlePageFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        navigationTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        preNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        postNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        launchContext: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        headless: import("ow").AnyPredicate<string | boolean>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        browserPoolOptions: import("ow").ObjectPredicate<object>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        persistCookiesPerSession: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        ignoreShadowRoots: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        ignoreIframes: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestQueue: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handleRequestFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestHandlerTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handleRequestTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        errorHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        failedRequestHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handleFailedRequestFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestRetries: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        sameDomainDelaySecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        statusMessageCallback: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        respectRobotsTxtFile: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>; // @ts-ignore optional peer dependency or compatibility with es2022 statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>; }; /** * All `BrowserCrawler` parameters are passed via an options object. */ protected constructor(options?: BrowserCrawlerOptions<Context>, config?: Configuration); protected _cleanupContext(crawlingContext: Context): Promise<void>; private containsSelectors; protected isRequestBlocked(crawlingContext: Context): Promise<string | false>; /** * Wrapper around requestHandler that opens and closes pages etc. */ protected _runRequestHandler(crawlingContext: Context): Promise<void>; protected _enhanceCrawlingContextWithPageInfo(crawlingContext: Context, page: CommonPage, createNewSession?: boolean): void; protected _handleNavigation(crawlingContext: Context): Promise<void>; protected _applyCookies({ session, request, page, browserController }: Context, preHooksCookies: string, postHooksCookies: string): Promise<void>; /** * Marks session bad in case of navigation timeout. */ protected _handleNavigationTimeout(crawlingContext: Context, error: Error): Promise<void>; /** * Transforms proxy-related errors to `SessionError`. */ protected _throwIfProxyError(error: Error): void; protected abstract _navigationHandler(crawlingContext: Context, gotoOptions: GoToOptions): Promise<Context['response'] | null | undefined>; /** * Should be overridden in case of different automation library that does not support this response API. */ protected _responseHandler(crawlingContext: Context): Promise<void>; protected _extendLaunchContext(_pageId: string, launchContext: LaunchContext): Promise<void>; protected _maybeAddSessionRetiredListener(_pageId: string, browserController: Context['browserController']): void; /** * Function for cleaning up after all requests are processed. * @ignore */ teardown(): Promise<void>; } /** @internal */ interface EnqueueLinksInternalOptions { options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>; page: CommonPage; requestQueue: RequestProvider; robotsTxtFile?: RobotsTxtFile; onSkippedRequest?: SkippedRequestCallback; originalRequestUrl: string; finalRequestUrl?: string; } /** @internal */ interface BoundEnqueueLinksInternalOptions { enqueueLinks: BasicCrawlingContext['enqueueLinks']; options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>; originalRequestUrl: string; finalRequestUrl?: string; page: CommonPage; } /** @internal */ // @ts-ignore optional peer dependency or compatibility with es2022 export declare function browserCrawlerEnqueueLinks(options: EnqueueLinksInternalOptions | BoundEnqueueLinksInternalOptions): Promise<import("@crawlee/types").BatchAddRequestsResult>; /** * Extracts URLs from a given page. * @ignore */ export declare function extractUrlsFromPage(page: { $$eval: Function; }, selector: string, baseUrl: string): Promise<string[]>; export {}; //# sourceMappingURL=browser-crawler.d.ts.map