@crawlee/playwright
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs with (but not limited to) headless Chrome and Puppeteer.
158 lines • 7.99 kB
TypeScript
import { type LoadedContext, type RouterHandler } from '@crawlee/browser';
import type { GetUserDataFromRequest, RestrictedCrawlingContext, RouterRoutes, StatisticsOptions, StatisticState } from '@crawlee/core';
import { Configuration, RequestHandlerResult, Statistics } from '@crawlee/core';
import type { Awaitable, Dictionary } from '@crawlee/types';
import { type CheerioRoot } from '@crawlee/utils';
import { type Cheerio, type Element } from 'cheerio';
import type { Log } from '@apify/log';
import type { PlaywrightCrawlerOptions, PlaywrightCrawlingContext } from './playwright-crawler';
import { PlaywrightCrawler } from './playwright-crawler';
import { RenderingTypePredictor } from './utils/rendering-type-prediction';
/**
 * Outcome of an operation, discriminated by the `ok` flag:
 * - `ok: true` carries the produced `result`,
 * - `ok: false` carries the `error` that was raised.
 *
 * Either variant may additionally carry `logs`, a list of `LogProxyCall` tuples.
 */
type Result<TResult> =
    | { ok: true; result: TResult; logs?: LogProxyCall[] }
    | { ok: false; error: unknown; logs?: LogProxyCall[] };
/**
* `StatisticState` extended with three optional numeric counters used by the adaptive crawler.
*
* NOTE(review): the meaning of each counter below is inferred from its name and from the
* matching `track*` method declared on `AdaptivePlaywrightCrawlerStatistics` — confirm against the implementation.
*/
interface AdaptivePlaywrightCrawlerStatisticState extends StatisticState {
/** Presumably the number of request handler runs performed in HTTP-only mode. */
httpOnlyRequestHandlerRuns?: number;
/** Presumably the number of request handler runs performed in a browser. */
browserRequestHandlerRuns?: number;
/** Presumably the number of times the predicted rendering type turned out to be wrong. */
renderingTypeMispredictions?: number;
}
/**
* `Statistics` subclass whose `state` is typed as `AdaptivePlaywrightCrawlerStatisticState`
* and which declares three additional `track*` methods.
*
* Only the declarations are visible in this file; behavioral notes below are assumptions to be
* confirmed against the implementation.
*/
declare class AdaptivePlaywrightCrawlerStatistics extends Statistics {
/** Statistics state, including the optional adaptive-crawler counters. */
state: AdaptivePlaywrightCrawlerStatisticState;
/** @param options Optional `StatisticsOptions`. */
constructor(options?: StatisticsOptions);
/** Synchronous reset. NOTE(review): presumably also resets the adaptive counters — confirm. */
reset(): void;
/** Protected async hook. NOTE(review): presumably restores previously persisted statistics — confirm. */
protected _maybeLoadStatistics(): Promise<void>;
/** NOTE(review): presumably records one run counted in `state.httpOnlyRequestHandlerRuns` — confirm. */
trackHttpOnlyRequestHandlerRun(): void;
/** NOTE(review): presumably records one run counted in `state.browserRequestHandlerRuns` — confirm. */
trackBrowserRequestHandlerRun(): void;
/** NOTE(review): presumably records one event counted in `state.renderingTypeMispredictions` — confirm. */
trackRenderingTypeMisprediction(): void;
}
/**
* Crawling context passed to the request handler of {@apilink AdaptivePlaywrightCrawler}.
* Extends `RestrictedCrawlingContext` with three selector/parsing helpers, all of which are asynchronous.
*/
export interface AdaptivePlaywrightCrawlerContext extends RestrictedCrawlingContext {
/**
* Wait for an element matching the selector to appear and return a Cheerio object of matched elements.
* Timeout defaults to 5s.
*
* @param selector Selector of the element(s) to wait for.
* @param timeoutMs Optional timeout in milliseconds.
*/
querySelector(selector: string, timeoutMs?: number): Promise<Cheerio<Element>>;
/**
* Wait for an element matching the selector to appear.
* Timeout defaults to 5s.
*
* **Example usage:**
* ```ts
* async requestHandler({ waitForSelector, parseWithCheerio }) {
* await waitForSelector('article h1');
* const $ = await parseWithCheerio();
* const title = $('title').text();
* },
* ```
*
* @param selector Selector of the element to wait for.
* @param timeoutMs Optional timeout in milliseconds.
*/
waitForSelector(selector: string, timeoutMs?: number): Promise<void>;
/**
* Returns Cheerio handle for `page.content()`, allowing to work with the data same way as with {@apilink CheerioCrawler}.
* When provided with the `selector` argument, it will first look for the selector with a 5s timeout.
* NOTE(review): presumably `timeoutMs` overrides that 5s default, as with `waitForSelector` — confirm.
*
* **Example usage:**
* ```ts
* async requestHandler({ parseWithCheerio }) {
* const $ = await parseWithCheerio();
* const title = $('title').text();
* },
* ```
*
* @param selector Optional selector to wait for before parsing.
* @param timeoutMs Optional timeout in milliseconds.
*/
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
}
/**
* Options accepted by the {@apilink AdaptivePlaywrightCrawler} constructor.
* These are the `PlaywrightCrawlerOptions` with `requestHandler` and `handlePageFunction` removed,
* plus a `requestHandler` typed for {@apilink AdaptivePlaywrightCrawlerContext} and the adaptive-specific options below.
*/
export interface AdaptivePlaywrightCrawlerOptions extends Omit<PlaywrightCrawlerOptions, 'requestHandler' | 'handlePageFunction'> {
/**
* Function that is called to process each request.
*
* The function receives the {@apilink AdaptivePlaywrightCrawlerContext} as an argument, and it must refrain from calling code with side effects,
* other than the methods of the crawling context. Any other side effects may be invoked repeatedly by the crawler, which can lead to inconsistent results.
*
* The function must return a promise, which is then awaited by the crawler.
*
* If the function throws an exception, the crawler will try to re-crawl the
* request later, up to `option.maxRequestRetries` times.
*/
requestHandler?: (crawlingContext: LoadedContext<AdaptivePlaywrightCrawlerContext>) => Awaitable<void>;
/**
* Specifies the frequency of rendering type detection checks - 0.1 means roughly 10% of requests.
* Defaults to 0.1 (so 10%).
*/
renderingTypeDetectionRatio?: number;
/**
* An optional callback that is called on dataset items found by the request handler in plain HTTP mode.
* If it returns false, the request is retried in a browser.
* If no callback is specified, every dataset item is considered valid.
*/
resultChecker?: (result: RequestHandlerResult) => boolean;
/**
* An optional callback used in rendering type detection. On each detection, the result of the plain HTTP run is compared to that of the browser one.
* If the callback returns true, the results are considered equal and the target site is considered static.
* If no result comparator is specified, but there is a `resultChecker`, any site where the `resultChecker` returns true is considered static.
* If neither `resultComparator` nor `resultChecker` are specified, a deep comparison of returned dataset items is used as a default.
*/
resultComparator?: (resultA: RequestHandlerResult, resultB: RequestHandlerResult) => boolean;
/**
* A custom rendering type predictor.
* Any object providing the `predict` and `storeResult` members of `RenderingTypePredictor` is accepted.
*/
renderingTypePredictor?: Pick<RenderingTypePredictor, 'predict' | 'storeResult'>;
}
/**
* Readonly tuple of the log method names that may appear in a `LogProxyCall`.
*/
declare const proxyLogMethods: readonly ["error", "exception", "softFail", "info", "debug", "perf", "warningOnce", "deprecated"];
/**
* Tuple describing a single log call: the `Log` instance, the name of one of the `proxyLogMethods`,
* and the remaining call arguments.
* NOTE(review): presumably these are captured during a request handler run so they can be replayed later — confirm.
*/
type LogProxyCall = [log: Log, method: (typeof proxyLogMethods)[number], ...args: unknown[]];
/**
* An extension of {@apilink PlaywrightCrawler} that uses a more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible.
*
* **Example usage:**
*
* ```javascript
* const crawler = new AdaptivePlaywrightCrawler({
* renderingTypeDetectionRatio: 0.1,
* async requestHandler({ querySelector, pushData, enqueueLinks, request, log }) {
* // This function is called to extract data from a single web page
* const $prices = await querySelector('span.price')
*
* await pushData({
* url: request.url,
* price: $prices.filter(':contains("$")').first().text(),
* })
*
* await enqueueLinks({ selector: '.pagination a' })
* },
* });
*
* await crawler.run([
* 'http://www.example.com/page-1',
* 'http://www.example.com/page-2',
* ]);
* ```
*
* @experimental
*/
export declare class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
/** Configuration instance used by this crawler. */
readonly config: Configuration;
// Private members below are emitted without types in the declaration output.
// NOTE(review): presumably they hold the corresponding values from `AdaptivePlaywrightCrawlerOptions` — confirm.
private adaptiveRequestHandler;
private renderingTypePredictor;
private resultChecker;
private resultComparator;
/** Statistics object typed with the adaptive-crawler specific counters and `track*` methods. */
readonly stats: AdaptivePlaywrightCrawlerStatistics;
/**
* Default {@apilink Router} instance that will be used if we don't specify any {@apilink AdaptivePlaywrightCrawlerOptions.requestHandler|`requestHandler`}.
* See {@apilink Router.addHandler|`router.addHandler()`} and {@apilink Router.addDefaultHandler|`router.addDefaultHandler()`}.
*/
// @ts-ignore optional peer dependency or compatibility with es2022
readonly router: RouterHandler<AdaptivePlaywrightCrawlerContext>;
/**
* @param options Optional crawler options.
* @param config Optional `Configuration` instance.
*/
constructor(options?: AdaptivePlaywrightCrawlerOptions, config?: Configuration);
/**
* Protected entry point for handling one request.
* NOTE(review): presumably decides between the plain-HTTP and browser run — confirm against the implementation.
*/
protected _runRequestHandler(crawlingContext: PlaywrightCrawlingContext): Promise<void>;
/**
* Takes the `calls` and `keyValueStoreChanges` of a `RequestHandlerResult` for the given crawling context.
* NOTE(review): presumably applies them to the real storages — confirm.
*/
protected commitResult(crawlingContext: PlaywrightCrawlingContext, { calls, keyValueStoreChanges }: RequestHandlerResult): Promise<void>;
/**
* Wraps an async function and returns a function with the same argument and return types.
* NOTE(review): presumably the wrapper permits storage access while `func` runs — confirm.
*/
protected allowStorageAccess<R, TArgs extends any[]>(func: (...args: TArgs) => Promise<R>): (...args: TArgs) => Promise<R>;
/**
* Resolves to a `Result` wrapping a `RequestHandlerResult`.
* NOTE(review): presumably runs the request handler in a browser — confirm.
*/
protected runRequestHandlerInBrowser(crawlingContext: PlaywrightCrawlingContext): Promise<Result<RequestHandlerResult>>;
/**
* Resolves to a `Result` wrapping a `RequestHandlerResult`.
* NOTE(review): presumably runs the request handler over plain HTTP; the role of `oldStateCopy` is not visible here — confirm.
*/
protected runRequestHandlerWithPlainHTTP(crawlingContext: PlaywrightCrawlingContext, oldStateCopy?: Dictionary): Promise<Result<RequestHandlerResult>>;
// NOTE(review): presumably produces the log proxy whose calls end up as `LogProxyCall` entries — confirm.
private createLogProxy;
}
/**
* Returns a `RouterHandler` typed for the given `Context`, which defaults to {@apilink AdaptivePlaywrightCrawlerContext}.
*
* @param routes Optional `RouterRoutes` for the given `Context` and `UserData`.
* NOTE(review): presumably these are registered on the new router as initial handlers — confirm.
* @returns A `RouterHandler<Context>`.
*/
export declare function createAdaptivePlaywrightRouter<Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): RouterHandler<Context>;
export {};
//# sourceMappingURL=adaptive-playwright-crawler.d.ts.map