UNPKG

@crawlee/playwright

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

204 lines • 10.7 kB
import type { BrowserHook, LoadedContext, RouterHandler } from '@crawlee/browser';
import type { BaseHttpResponseData, EnqueueLinksOptions, GetUserDataFromRequest, RestrictedCrawlingContext, RouterRoutes, StatisticsOptions, StatisticState } from '@crawlee/core';
import { Configuration, RequestHandlerResult, Statistics } from '@crawlee/core';
import type { Awaitable, BatchAddRequestsResult, Dictionary } from '@crawlee/types';
import { type CheerioRoot } from '@crawlee/utils';
import { type Cheerio, type Element } from 'cheerio';
// @ts-ignore optional peer dependency or compatibility with es2022
import type { Page } from 'playwright';
import type { SetRequired } from 'type-fest';
import type { Log } from '@apify/log';
import type { PlaywrightCrawlerOptions, PlaywrightCrawlingContext, PlaywrightGotoOptions } from './playwright-crawler';
import { PlaywrightCrawler } from './playwright-crawler';
import { RenderingTypePredictor } from './utils/rendering-type-prediction';

/**
 * Outcome of a single request handler run, discriminated by `ok`:
 * a success carries `result`, a failure carries `error`.
 * Either variant may carry the log calls captured during the run.
 */
type Result<TResult> = {
    result: TResult;
    ok: true;
    logs?: LogProxyCall[];
} | {
    error: unknown;
    ok: false;
    logs?: LogProxyCall[];
};

/**
 * Statistics state extended with counters specific to adaptive crawling.
 * NOTE(review): counter meanings are taken from the field names - the code that
 * updates them is not visible in this declaration file.
 */
interface AdaptivePlaywrightCrawlerStatisticState extends StatisticState {
    /** Presumably the number of request handler runs performed over plain HTTP. */
    httpOnlyRequestHandlerRuns?: number;
    /** Presumably the number of request handler runs performed in a browser. */
    browserRequestHandlerRuns?: number;
    /** Presumably the number of requests whose predicted rendering type turned out to be wrong. */
    renderingTypeMispredictions?: number;
}

/**
 * {@link Statistics} subclass whose `state` has the {@link AdaptivePlaywrightCrawlerStatisticState} shape.
 * NOTE(review): the `track*` methods presumably increment the matching counter in `state` - confirm in the implementation.
 */
declare class AdaptivePlaywrightCrawlerStatistics extends Statistics {
    state: AdaptivePlaywrightCrawlerStatisticState;
    constructor(options?: StatisticsOptions);
    reset(): void;
    protected _maybeLoadStatistics(): Promise<void>;
    trackHttpOnlyRequestHandlerRun(): void;
    trackBrowserRequestHandlerRun(): void;
    trackRenderingTypeMisprediction(): void;
}

/**
 * Crawling context passed to the request handler of {@link AdaptivePlaywrightCrawler}.
 */
export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary = Dictionary> extends RestrictedCrawlingContext<UserData> {
    /**
     * The HTTP response, either from the HTTP client or from the initial request from playwright's navigation.
     */
    response: BaseHttpResponseData;
    /**
     * Playwright Page object.
     * If accessed in HTTP-only rendering, this will throw an error and cause the request to be retried in a browser.
     */
    page: Page;
    /**
     * Wait for an element matching the selector to appear and return a Cheerio object of matched elements.
     * Timeout defaults to 5s.
     */
    querySelector(selector: string, timeoutMs?: number): Promise<Cheerio<Element>>;
    /**
     * Wait for an element matching the selector to appear.
     * Timeout defaults to 5s.
     *
     * **Example usage:**
     * ```ts
     * async requestHandler({ waitForSelector, parseWithCheerio }) {
     *     await waitForSelector('article h1');
     *     const $ = await parseWithCheerio();
     *     const title = $('title').text();
     * },
     * ```
     */
    waitForSelector(selector: string, timeoutMs?: number): Promise<void>;
    /**
     * Returns a Cheerio handle for `page.content()`, allowing you to work with the data the same way as with {@link CheerioCrawler}.
     * When provided with the `selector` argument, it will first look for the selector with a 5s timeout.
     *
     * **Example usage:**
     * ```ts
     * async requestHandler({ parseWithCheerio }) {
     *     const $ = await parseWithCheerio();
     *     const title = $('title').text();
     * },
     * ```
     */
    parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
}

/**
 * Navigation hook signature used by {@link AdaptivePlaywrightCrawlerOptions.preNavigationHooks} and
 * {@link AdaptivePlaywrightCrawlerOptions.postNavigationHooks}. The hook receives a subset of the
 * crawling context (`id`, `request`, `session`, `proxyInfo`, `log`) plus an optional `page`.
 */
interface AdaptiveHook extends BrowserHook<Pick<AdaptivePlaywrightCrawlerContext, 'id' | 'request' | 'session' | 'proxyInfo' | 'log'> & {
    page?: Page;
}, PlaywrightGotoOptions> {
}

/**
 * Options accepted by the {@link AdaptivePlaywrightCrawler} constructor.
 * Same as {@link PlaywrightCrawlerOptions}, except that `requestHandler`, `handlePageFunction`,
 * `preNavigationHooks` and `postNavigationHooks` are omitted from the base type; all but
 * `handlePageFunction` are re-declared below with adaptive-specific signatures.
 */
export interface AdaptivePlaywrightCrawlerOptions extends Omit<PlaywrightCrawlerOptions, 'requestHandler' | 'handlePageFunction' | 'preNavigationHooks' | 'postNavigationHooks'> {
    /**
     * Function that is called to process each request.
     *
     * The function receives the {@link AdaptivePlaywrightCrawlerContext} as an argument, and it must refrain from calling code with side effects,
     * other than the methods of the crawling context. Any other side effects may be invoked repeatedly by the crawler, which can lead to inconsistent results.
     *
     * The function must return a promise, which is then awaited by the crawler.
     *
     * If the function throws an exception, the crawler will try to re-crawl the
     * request later, up to `option.maxRequestRetries` times.
     */
    requestHandler?: (crawlingContext: LoadedContext<AdaptivePlaywrightCrawlerContext>) => Awaitable<void>;
    /**
     * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies.
     * The function accepts a subset of the crawling context. If you attempt to access the `page` property during HTTP-only crawling,
     * an exception will be thrown. If it's not caught, the request will be transparently retried in a browser.
     */
    preNavigationHooks?: AdaptiveHook[];
    /**
     * Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
     * The function accepts a subset of the crawling context. If you attempt to access the `page` property during HTTP-only crawling,
     * an exception will be thrown. If it's not caught, the request will be transparently retried in a browser.
     */
    postNavigationHooks?: AdaptiveHook[];
    /**
     * Specifies the frequency of rendering type detection checks - 0.1 means roughly 10% of requests.
     * Defaults to 0.1 (so 10%).
     */
    renderingTypeDetectionRatio?: number;
    /**
     * An optional callback that is called on dataset items found by the request handler in plain HTTP mode.
     * If it returns false, the request is retried in a browser.
     * If no callback is specified, every dataset item is considered valid.
     */
    resultChecker?: (result: RequestHandlerResult) => boolean;
    /**
     * An optional callback used in rendering type detection. On each detection, the result of the plain HTTP run is compared to that of the browser one.
     * If a callback is provided, the contract is as follows:
     * If the callback returns true or 'equal', the results are considered equal and the target site is considered static.
     * If it returns false or 'different', the target site is considered client-rendered.
     * If it returns 'inconclusive', the detection result won't be used.
     * If no result comparator is specified, but there is a `resultChecker`, any site where the `resultChecker` returns true is considered static.
     * If neither `resultComparator` nor `resultChecker` are specified, a deep comparison of returned dataset items is used as a default.
     */
    resultComparator?: (resultA: RequestHandlerResult, resultB: RequestHandlerResult) => boolean | 'equal' | 'different' | 'inconclusive';
    /**
     * A custom rendering type predictor
     */
    renderingTypePredictor?: Pick<RenderingTypePredictor, 'predict' | 'storeResult' | 'initialize'>;
    /**
     * Prevent direct access to storage in request handlers (only allow using context helpers).
     * Defaults to `true`
     */
    preventDirectStorageAccess?: boolean;
}

/** Names of the log methods that can appear in a {@link LogProxyCall}. */
declare const proxyLogMethods: readonly ["error", "exception", "softFail", "info", "debug", "perf", "warningOnce", "deprecated"];

/**
 * A captured log call: the target `Log` instance, the name of the method that was called
 * (one of {@link proxyLogMethods}) and the arguments it was called with.
 */
type LogProxyCall = [log: Log, method: (typeof proxyLogMethods)[number], ...args: unknown[]];

/**
 * An extension of {@link PlaywrightCrawler} that uses a more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible.
 *
 * **Example usage:**
 *
 * ```javascript
 * const crawler = new AdaptivePlaywrightCrawler({
 *     renderingTypeDetectionRatio: 0.1,
 *     async requestHandler({ querySelector, pushData, enqueueLinks, request, log }) {
 *         // This function is called to extract data from a single web page
 *         const $prices = await querySelector('span.price')
 *
 *         await pushData({
 *             url: request.url,
 *             price: $prices.filter(':contains("$")').first().text(),
 *         })
 *
 *         await enqueueLinks({ selector: '.pagination a' })
 *     },
 * });
 *
 * await crawler.run([
 *     'http://www.example.com/page-1',
 *     'http://www.example.com/page-2',
 * ]);
 * ```
 *
 * @experimental
 */
export declare class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
    readonly config: Configuration;
    private adaptiveRequestHandler;
    private renderingTypePredictor;
    private resultChecker;
    private resultComparator;
    private preventDirectStorageAccess;
    readonly stats: AdaptivePlaywrightCrawlerStatistics;
    private inFlightRenderingTypeDetections;
    /**
     * Default {@link Router} instance that will be used if we don't specify any {@link AdaptivePlaywrightCrawlerOptions.requestHandler|`requestHandler`}.
     * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
     */
    // @ts-ignore optional peer dependency or compatibility with es2022
    readonly router: RouterHandler<AdaptivePlaywrightCrawlerContext>;
    constructor(options?: AdaptivePlaywrightCrawlerOptions, config?: Configuration);
    /**
     * Returns the number of rendering type detections currently in progress.
     */
    get inFlightRenderingTypeDetectionCount(): number;
    protected _init(): Promise<void>;
    // NOTE(review): presumably decides between the plain-HTTP and browser code paths below - confirm in the implementation.
    protected _runRequestHandler(crawlingContext: PlaywrightCrawlingContext): Promise<void>;
    protected commitResult(crawlingContext: PlaywrightCrawlingContext, { calls, keyValueStoreChanges }: RequestHandlerResult): Promise<void>;
    protected allowStorageAccess<R, TArgs extends any[]>(func: (...args: TArgs) => Promise<R>): (...args: TArgs) => Promise<R>;
    protected runRequestHandlerInBrowser(crawlingContext: PlaywrightCrawlingContext): Promise<{
        result: Result<RequestHandlerResult>;
        initialStateCopy?: Record<string, unknown>;
    }>;
    protected runRequestHandlerWithPlainHTTP(crawlingContext: PlaywrightCrawlingContext, oldStateCopy?: Dictionary): Promise<Result<RequestHandlerResult>>;
    protected enqueueLinks(options: SetRequired<EnqueueLinksOptions, 'urls'>, request: RestrictedCrawlingContext['request'], result: RequestHandlerResult): Promise<BatchAddRequestsResult>;
    private createLogProxy;
}

/**
 * Creates a router typed for {@link AdaptivePlaywrightCrawlerContext}, optionally pre-populated with `routes`.
 * The returned {@link RouterHandler} has the shape expected by {@link AdaptivePlaywrightCrawler.router}.
 */
export declare function createAdaptivePlaywrightRouter<Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): RouterHandler<Context>;
export {};
//# sourceMappingURL=adaptive-playwright-crawler.d.ts.map