@crawlee/playwright
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
204 lines • 10.7 kB
TypeScript
import type { BrowserHook, LoadedContext, RouterHandler } from '@crawlee/browser';
import type { BaseHttpResponseData, EnqueueLinksOptions, GetUserDataFromRequest, RestrictedCrawlingContext, RouterRoutes, StatisticsOptions, StatisticState } from '@crawlee/core';
import { Configuration, RequestHandlerResult, Statistics } from '@crawlee/core';
import type { Awaitable, BatchAddRequestsResult, Dictionary } from '@crawlee/types';
import { type CheerioRoot } from '@crawlee/utils';
import { type Cheerio, type Element } from 'cheerio';
// @ts-ignore optional peer dependency or compatibility with es2022
import type { Page } from 'playwright';
import type { SetRequired } from 'type-fest';
import type { Log } from '@apify/log';
import type { PlaywrightCrawlerOptions, PlaywrightCrawlingContext, PlaywrightGotoOptions } from './playwright-crawler';
import { PlaywrightCrawler } from './playwright-crawler';
import { RenderingTypePredictor } from './utils/rendering-type-prediction';
/**
* Outcome of one request handler run, discriminated by the `ok` flag:
* - `ok: true` carries the produced `result`,
* - `ok: false` carries an `error` of unknown type.
*
* Either variant may additionally carry `logs`, a list of {@link LogProxyCall} tuples.
*/
type Result<TResult> = {
result: TResult;
ok: true;
logs?: LogProxyCall[];
} | {
error: unknown;
ok: false;
logs?: LogProxyCall[];
};
/**
* {@link StatisticState} extended with three optional numeric counters used by the adaptive crawler.
*
* NOTE(review): judging by the field names and the `track*` methods of `AdaptivePlaywrightCrawlerStatistics`,
* these count HTTP-only handler runs, browser handler runs and rendering type mispredictions — confirm against the implementation.
*/
interface AdaptivePlaywrightCrawlerStatisticState extends StatisticState {
httpOnlyRequestHandlerRuns?: number;
browserRequestHandlerRuns?: number;
renderingTypeMispredictions?: number;
}
/**
* {@link Statistics} subclass whose `state` is narrowed to {@link AdaptivePlaywrightCrawlerStatisticState}.
* It redeclares `reset()` and `_maybeLoadStatistics()` and adds three `track*` methods.
*
* NOTE(review): each `track*` method presumably increments the matching counter in `state` — confirm against the implementation.
*/
declare class AdaptivePlaywrightCrawlerStatistics extends Statistics {
state: AdaptivePlaywrightCrawlerStatisticState;
constructor(options?: StatisticsOptions);
reset(): void;
protected _maybeLoadStatistics(): Promise<void>;
// Presumably bumps `state.httpOnlyRequestHandlerRuns` — TODO confirm.
trackHttpOnlyRequestHandlerRun(): void;
// Presumably bumps `state.browserRequestHandlerRuns` — TODO confirm.
trackBrowserRequestHandlerRun(): void;
// Presumably bumps `state.renderingTypeMispredictions` — TODO confirm.
trackRenderingTypeMisprediction(): void;
}
/**
* Crawling context passed to the request handler of {@link AdaptivePlaywrightCrawler}.
* It extends {@link RestrictedCrawlingContext} with the HTTP `response`, the Playwright `page`
* and selector/Cheerio helpers.
*/
export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary = Dictionary> extends RestrictedCrawlingContext<UserData> {
/**
* The HTTP response, either from the HTTP client or from the initial request from playwright's navigation.
*/
response: BaseHttpResponseData;
/**
* Playwright Page object. If accessed in HTTP-only rendering, this will throw an error and make the {@link AdaptivePlaywrightCrawler} retry the request in a browser.
*/
page: Page;
/**
* Wait for an element matching the selector to appear and return a Cheerio object of matched elements.
* Timeout defaults to 5s.
*/
querySelector(selector: string, timeoutMs?: number): Promise<Cheerio<Element>>;
/**
* Wait for an element matching the selector to appear.
* Timeout defaults to 5s.
*
* **Example usage:**
* ```ts
* async requestHandler({ waitForSelector, parseWithCheerio }) {
* await waitForSelector('article h1');
* const $ = await parseWithCheerio();
* const title = $('title').text();
* },
* ```
*/
waitForSelector(selector: string, timeoutMs?: number): Promise<void>;
/**
* Returns Cheerio handle for `page.content()`, allowing to work with the data same way as with {@link CheerioCrawler}.
* When provided with the `selector` argument, it will first look for the selector with a 5s timeout.
*
* **Example usage:**
* ```ts
* async requestHandler({ parseWithCheerio }) {
* const $ = await parseWithCheerio();
* const title = $('title').text();
* },
* ```
*/
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
}
/**
* Hook type used for `preNavigationHooks` and `postNavigationHooks` of the adaptive crawler.
* The hook context is limited to `id`, `request`, `session`, `proxyInfo` and `log` picked from
* {@link AdaptivePlaywrightCrawlerContext}, plus an optional `page`; the second type argument
* passed to {@link BrowserHook} is {@link PlaywrightGotoOptions}.
*/
interface AdaptiveHook extends BrowserHook<Pick<AdaptivePlaywrightCrawlerContext, 'id' | 'request' | 'session' | 'proxyInfo' | 'log'> & {
page?: Page;
}, PlaywrightGotoOptions> {
}
/**
* Options accepted by the {@link AdaptivePlaywrightCrawler} constructor.
* Based on {@link PlaywrightCrawlerOptions} with `requestHandler`, `handlePageFunction`, `preNavigationHooks`
* and `postNavigationHooks` omitted; three of them are redeclared below with adaptive-specific types.
*/
export interface AdaptivePlaywrightCrawlerOptions extends Omit<PlaywrightCrawlerOptions, 'requestHandler' | 'handlePageFunction' | 'preNavigationHooks' | 'postNavigationHooks'> {
/**
* Function that is called to process each request.
*
* The function receives the {@link AdaptivePlaywrightCrawlerContext} as an argument, and it must refrain from calling code with side effects,
* other than the methods of the crawling context. Any other side effects may be invoked repeatedly by the crawler, which can lead to inconsistent results.
*
* The function must return a promise, which is then awaited by the crawler.
*
* If the function throws an exception, the crawler will try to re-crawl the
* request later, up to `option.maxRequestRetries` times.
*/
requestHandler?: (crawlingContext: LoadedContext<AdaptivePlaywrightCrawlerContext>) => Awaitable<void>;
/**
* Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies.
* The function accepts a subset of the crawling context. If you attempt to access the `page` property during HTTP-only crawling,
* an exception will be thrown. If it's not caught, the request will be transparently retried in a browser.
*/
preNavigationHooks?: AdaptiveHook[];
/**
* Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
* The function accepts a subset of the crawling context. If you attempt to access the `page` property during HTTP-only crawling,
* an exception will be thrown. If it's not caught, the request will be transparently retried in a browser.
*/
postNavigationHooks?: AdaptiveHook[];
/**
* Specifies the frequency of rendering type detection checks - 0.1 means roughly 10% of requests.
* Defaults to 0.1 (so 10%).
*/
renderingTypeDetectionRatio?: number;
/**
* An optional callback that is called on dataset items found by the request handler in plain HTTP mode.
* If it returns false, the request is retried in a browser.
* If no callback is specified, every dataset item is considered valid.
*/
resultChecker?: (result: RequestHandlerResult) => boolean;
/**
* An optional callback used in rendering type detection. On each detection, the result of the plain HTTP run is compared to that of the browser one.
* If a callback is provided, the contract is as follows:
* If the callback returns true or 'equal', the results are considered equal and the target site is considered static.
* If it returns false or 'different', the target site is considered client-rendered.
* If it returns 'inconclusive', the detection result won't be used.
* If no result comparator is specified, but there is a `resultChecker`, any site where the `resultChecker` returns true is considered static.
* If neither `resultComparator` nor `resultChecker` are specified, a deep comparison of returned dataset items is used as a default.
*/
resultComparator?: (resultA: RequestHandlerResult, resultB: RequestHandlerResult) => boolean | 'equal' | 'different' | 'inconclusive';
/**
* A custom rendering type predictor. Only the `predict`, `storeResult` and `initialize` members of {@link RenderingTypePredictor} are required.
*/
renderingTypePredictor?: Pick<RenderingTypePredictor, 'predict' | 'storeResult' | 'initialize'>;
/**
* Prevent direct access to storage in request handlers (only allow using context helpers).
* Defaults to `true`.
*/
preventDirectStorageAccess?: boolean;
}
/**
* Readonly tuple of the log method names that a {@link LogProxyCall} may refer to.
*/
declare const proxyLogMethods: readonly ["error", "exception", "softFail", "info", "debug", "perf", "warningOnce", "deprecated"];
/**
* Describes a single log call as a tuple: the target `Log` instance, the name of the method
* (one of `proxyLogMethods`), and the remaining call arguments.
*
* NOTE(review): presumably recorded by the crawler's private `createLogProxy` so the calls can be replayed later — confirm against the implementation.
*/
type LogProxyCall = [log: Log, method: (typeof proxyLogMethods)[number], ...args: unknown[]];
/**
* An extension of {@link PlaywrightCrawler} that uses a more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible.
*
* **Example usage:**
*
* ```javascript
* const crawler = new AdaptivePlaywrightCrawler({
* renderingTypeDetectionRatio: 0.1,
* async requestHandler({ querySelector, pushData, enqueueLinks, request, log }) {
* // This function is called to extract data from a single web page
* const $prices = await querySelector('span.price')
*
* await pushData({
* url: request.url,
* price: $prices.filter(':contains("$")').first().text(),
* })
*
* await enqueueLinks({ selector: '.pagination a' })
* },
* });
*
* await crawler.run([
* 'http://www.example.com/page-1',
* 'http://www.example.com/page-2',
* ]);
* ```
*
* @experimental
*/
export declare class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
readonly config: Configuration;
// Private members below are declared without types, so their shapes cannot be read from this declaration file.
// NOTE(review): most names mirror entries of AdaptivePlaywrightCrawlerOptions — presumably they hold the corresponding option values; confirm.
private adaptiveRequestHandler;
private renderingTypePredictor;
private resultChecker;
private resultComparator;
private preventDirectStorageAccess;
/** Statistics object typed as the adaptive-specific `AdaptivePlaywrightCrawlerStatistics`. */
readonly stats: AdaptivePlaywrightCrawlerStatistics;
private inFlightRenderingTypeDetections;
/**
* Default {@link Router} instance that will be used if we don't specify any {@link AdaptivePlaywrightCrawlerOptions.requestHandler|`requestHandler`}.
* See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
*/
// @ts-ignore optional peer dependency or compatibility with es2022
readonly router: RouterHandler<AdaptivePlaywrightCrawlerContext>;
/**
* @param options Crawler options, see {@link AdaptivePlaywrightCrawlerOptions}.
* @param config Optional {@link Configuration} instance.
*/
constructor(options?: AdaptivePlaywrightCrawlerOptions, config?: Configuration);
/**
* Returns the number of rendering type detections currently in progress.
*/
get inFlightRenderingTypeDetectionCount(): number;
protected _init(): Promise<void>;
protected _runRequestHandler(crawlingContext: PlaywrightCrawlingContext): Promise<void>;
/**
* Takes the `calls` and `keyValueStoreChanges` parts of a {@link RequestHandlerResult}.
* NOTE(review): presumably applies the recorded handler output to the real storages — confirm against the implementation.
*/
protected commitResult(crawlingContext: PlaywrightCrawlingContext, { calls, keyValueStoreChanges }: RequestHandlerResult): Promise<void>;
/**
* Wraps an async function and returns a function with the same parameters and return type.
* NOTE(review): presumably lifts the `preventDirectStorageAccess` restriction for the duration of the call — confirm.
*/
protected allowStorageAccess<R, TArgs extends any[]>(func: (...args: TArgs) => Promise<R>): (...args: TArgs) => Promise<R>;
/**
* Resolves with the handler `result` (success or failure, see `Result`) and an optional `initialStateCopy`.
* NOTE(review): by its name, runs the request handler using the browser — confirm.
*/
protected runRequestHandlerInBrowser(crawlingContext: PlaywrightCrawlingContext): Promise<{
result: Result<RequestHandlerResult>;
initialStateCopy?: Record<string, unknown>;
}>;
/**
* Resolves with the handler outcome (success or failure, see `Result`); accepts an optional `oldStateCopy`.
* NOTE(review): by its name, runs the request handler over plain HTTP without a browser — confirm.
*/
protected runRequestHandlerWithPlainHTTP(crawlingContext: PlaywrightCrawlingContext, oldStateCopy?: Dictionary): Promise<Result<RequestHandlerResult>>;
/**
* Variant of `enqueueLinks` that requires `options.urls` and additionally receives the current `request` and a {@link RequestHandlerResult}.
* NOTE(review): presumably records the enqueue operation into `result` instead of touching the request queue directly — confirm.
*/
protected enqueueLinks(options: SetRequired<EnqueueLinksOptions, 'urls'>, request: RestrictedCrawlingContext['request'], result: RequestHandlerResult): Promise<BatchAddRequestsResult>;
private createLogProxy;
}
/**
* Returns a {@link RouterHandler} typed for {@link AdaptivePlaywrightCrawlerContext} (or a subtype given as `Context`).
* `UserData` defaults to the user data type derived from `Context['request']`.
*
* @param routes Optional {@link RouterRoutes} table. NOTE(review): presumably pre-registers handlers on the returned router, as with other crawlee router factories — confirm.
* @returns A router handler for the given context type.
*/
export declare function createAdaptivePlaywrightRouter<Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): RouterHandler<Context>;
export {};
//# sourceMappingURL=adaptive-playwright-crawler.d.ts.map