// @crawlee/core — type declarations for `Request` (request.d.ts)
// The scalable web crawling and scraping library for JavaScript/Node.js.
// Enables development of data extraction and web automation jobs (not only)
// with headless Chrome and Puppeteer.
import type { BinaryLike } from 'node:crypto';
import type { Dictionary } from '@crawlee/types';
import type { EnqueueLinksOptions } from './enqueue_links/enqueue_links';
import type { SkippedRequestReason } from './enqueue_links/shared';
import type { AllowedHttpMethods } from './typedefs';
/**
 * Lifecycle states of a {@link Request} as it moves through a crawler run.
 *
 * NOTE(review): this is a numeric `declare enum`, so the numeric values are part of
 * the public API and may be persisted alongside the request — do not reorder or
 * renumber members. Member meanings below are inferred from the names; confirm
 * against the crawler implementation.
 */
export declare enum RequestState {
/** Not yet picked up for processing. */
UNPROCESSED = 0,
/** Pre-navigation phase (presumably while pre-navigation hooks run). */
BEFORE_NAV = 1,
/** Post-navigation phase (presumably after the page/HTTP navigation finished). */
AFTER_NAV = 2,
/** The user-supplied request handler is running. */
REQUEST_HANDLER = 3,
/** Processing finished successfully. */
DONE = 4,
/** An error handler is running after a handler failure. */
ERROR_HANDLER = 5,
/** Processing failed. */
ERROR = 6,
/** The request was skipped (see `skippedReason` on {@link RequestOptions}). */
SKIPPED = 7
}
/**
 * Represents a URL to be crawled, optionally including HTTP method, headers, payload and other metadata.
 * The `Request` object also stores information about errors that occurred during processing of the request.
 *
 * Each `Request` instance has the `uniqueKey` property, which can be either specified
 * manually in the constructor or generated automatically from the URL. Two requests with the same `uniqueKey`
 * are considered as pointing to the same web resource. This behavior applies to all Crawlee classes,
 * such as {@link RequestList}, {@link RequestQueue}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler}.
 *
 * > To access and examine the actual request sent over http, with all autofilled headers you can access
 * `response.request` object from the request handler
 *
 * Example use:
 *
 * ```javascript
 * const request = new Request({
 * url: 'http://www.example.com',
 * headers: { Accept: 'application/json' },
 * });
 *
 * ...
 *
 * request.userData.foo = 'bar';
 * request.pushErrorMessage(new Error('Request failed!'));
 *
 * ...
 *
 * const foo = request.userData.foo;
 * ```
 * @category Sources
 */
export declare class Request<UserData extends Dictionary = Dictionary> {
/** Request ID */
id?: string;
/** URL of the web page to crawl. */
url: string;
/**
 * An actually loaded URL after redirects, if present. HTTP redirects are guaranteed
 * to be included.
 *
 * When using {@link PuppeteerCrawler} or {@link PlaywrightCrawler}, meta tag and JavaScript redirects may,
 * or may not be included, depending on their nature. This generally means that redirects,
 * which happen immediately will most likely be included, but delayed redirects will not.
 */
loadedUrl?: string;
/**
 * A unique key identifying the request.
 * Two requests with the same `uniqueKey` are considered as pointing to the same URL.
 */
uniqueKey: string;
/** HTTP method, e.g. `GET` or `POST`. */
method: AllowedHttpMethods;
/** HTTP request payload, e.g. for POST requests. */
payload?: string;
/** The `true` value indicates that the request will not be automatically retried on error. */
noRetry: boolean;
/** Indicates the number of times the crawling of the request has been retried on error. */
retryCount: number;
/** An array of error messages from request processing. */
errorMessages: string[];
/** Object with HTTP headers. Key is header name, value is the value. */
headers?: Record<string, string>;
/** Private store for the custom user data assigned to the request. */
private _userData;
/** Custom user data assigned to the request. */
userData: UserData;
/**
 * ISO datetime string that indicates the time when the request has been processed.
 * Is `undefined` if the request has not been handled yet (the property is optional,
 * so it is absent rather than `null`).
 */
handledAt?: string;
/**
 * `Request` parameters including the URL, HTTP method and headers, and others.
 */
constructor(options: RequestOptions<UserData>);
/** Tells the crawler processing this request to skip the navigation and process the request directly. */
get skipNavigation(): boolean;
/** Tells the crawler processing this request to skip the navigation and process the request directly. */
set skipNavigation(value: boolean);
/**
 * Depth of the request in the current crawl tree.
 * Note that this is dependent on the crawler setup and might produce unexpected results when used with multiple crawlers.
 */
get crawlDepth(): number;
/**
 * Depth of the request in the current crawl tree.
 * Note that this is dependent on the crawler setup and might produce unexpected results when used with multiple crawlers.
 */
set crawlDepth(value: number);
/** Indicates the number of times the crawling of the request has rotated the session due to a session or a proxy error. */
get sessionRotationCount(): number;
/** Indicates the number of times the crawling of the request has rotated the session due to a session or a proxy error. */
set sessionRotationCount(value: number);
/** shortcut for getting `request.userData.label` */
get label(): string | undefined;
/** shortcut for setting `request.userData.label` */
set label(value: string | undefined);
/** Maximum number of retries for this request. Allows to override the global `maxRequestRetries` option of `BasicCrawler`. */
get maxRetries(): number | undefined;
/** Maximum number of retries for this request. Allows to override the global `maxRequestRetries` option of `BasicCrawler`. */
set maxRetries(value: number | undefined);
/** Describes the request's current lifecycle state. */
get state(): RequestState;
/** Describes the request's current lifecycle state. */
set state(value: RequestState);
/** Strategy used when this request was enqueued (see {@link EnqueueLinksOptions}'s `strategy`) — internal, hence the private accessors. */
private get enqueueStrategy();
private set enqueueStrategy(value);
/**
 * Stores information about an error that occurred during processing of this request.
 *
 * You should always use Error instances when throwing errors in JavaScript.
 *
 * Nevertheless, to improve the debugging experience when using third party libraries
 * that may not always throw an Error instance, the function performs a type
 * inspection of the passed argument and attempts to extract as much information
 * as possible, since just throwing a bad type error makes any debugging rather difficult.
 *
 * @param errorOrMessage Error object or error message to be stored in the request.
 * @param [options]
 */
pushErrorMessage(errorOrMessage: unknown, options?: PushErrorMessageOptions): void;
/** Instance-level hook for computing the `uniqueKey`; protected so subclasses can presumably customize it — confirm against implementation. */
protected _computeUniqueKey(options: ComputeUniqueKeyOptions): string;
/** Instance-level hook for hashing a request payload; protected counterpart of the static {@link Request.hashPayload}. */
protected _hashPayload(payload: BinaryLike): string;
/**
 * Computes a normalized `uniqueKey` from the URL and, when `useExtendedUniqueKey` is set,
 * also the method and payload (see {@link RequestOptions.uniqueKey} for the format).
 * @internal
 */
static computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey, }: ComputeUniqueKeyOptions): string;
/**
 * Hashes an HTTP payload for inclusion in an extended `uniqueKey`.
 * @internal
 */
static hashPayload(payload: BinaryLike): string;
}
/**
 * Specifies required and optional fields for constructing a {@link Request}.
 */
export interface RequestOptions<UserData extends Dictionary = Dictionary> {
/** URL of the web page to crawl. It must be a non-empty string. */
url: string;
/**
 * A unique key identifying the request.
 * Two requests with the same `uniqueKey` are considered as pointing to the same URL.
 *
 * If `uniqueKey` is not provided, then it is automatically generated by normalizing the URL.
 * For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `uniqueKey`
 * of `http://www.example.com/something`.
 *
 * The `keepUrlFragment` option determines whether URL hash fragment is included in the `uniqueKey` or not.
 *
 * The `useExtendedUniqueKey` option determines whether method and payload are included in the `uniqueKey`,
 * producing a `uniqueKey` in the following format: `METHOD(payloadHash):normalizedUrl`. This is useful
 * when requests point to the same URL, but with different methods and payloads. For example: form submits.
 *
 * Pass an arbitrary non-empty text value to the `uniqueKey` property
 * to override the default behavior and specify which URLs shall be considered equal.
 */
uniqueKey?: string;
/** @default 'GET' */
method?: AllowedHttpMethods | Lowercase<AllowedHttpMethods>;
/** HTTP request payload, e.g. for POST requests. */
payload?: string;
/**
 * HTTP headers in the following format:
 * ```
 * {
 * Accept: 'text/html',
 * 'Content-Type': 'application/json'
 * }
 * ```
 */
headers?: Record<string, string>;
/**
 * Custom user data assigned to the request. Use this to save any request related data to the
 * request's scope, keeping them accessible on retries, failures etc.
 */
userData?: UserData;
/**
 * Shortcut for setting `userData: { label: '...' }`.
 */
label?: string;
/**
 * If `false` then the hash part of a URL is removed when computing the `uniqueKey` property.
 * For example, this causes the `http://www.example.com#foo` and `http://www.example.com#bar` URLs
 * to have the same `uniqueKey` of `http://www.example.com` and thus the URLs are considered equal.
 * Note that this option only has an effect if `uniqueKey` is not set.
 * @default false
 */
keepUrlFragment?: boolean;
/**
 * If `true` then the `uniqueKey` is computed not only from the URL, but also from the method and payload
 * properties. This is useful when making requests to the same URL that are differentiated by method
 * or payload, such as form submit navigations in browsers.
 * @default false
 */
useExtendedUniqueKey?: boolean;
/**
 * The `true` value indicates that the request will not be automatically retried on error.
 * @default false
 */
noRetry?: boolean;
/**
 * If set to `true` then the crawler processing this request evaluates
 * the `requestHandler` immediately without prior browser navigation.
 * @default false
 */
skipNavigation?: boolean;
/**
 * Depth of the request in the current crawl tree.
 * Note that this is dependent on the crawler setup and might produce unexpected results when used with multiple crawlers.
 * @default 0
 */
crawlDepth?: number;
/**
 * Reason for skipping this request.
 * This is used to provide more information about why the request was skipped.
 * @internal
 */
skippedReason?: SkippedRequestReason;
/**
 * Maximum number of retries for this request. Allows to override the global `maxRequestRetries` option of `BasicCrawler`.
 */
maxRetries?: number;
/** Pre-assigned request ID (normally generated by the storage layer). @internal */
id?: string;
/** ISO datetime of when the request was handled (normally set by the crawler). @internal */
handledAt?: string;
/** Expiration of the storage-level lock on this request — presumably set by the request queue; confirm. @internal */
lockExpiresAt?: Date;
/** Strategy this request was enqueued with (see {@link EnqueueLinksOptions}'s `strategy`). @internal */
enqueueStrategy?: EnqueueLinksOptions['strategy'];
}
/**
 * Options for {@link Request.pushErrorMessage}.
 */
export interface PushErrorMessageOptions {
/**
 * Only push the error message without stack trace when true.
 * @default false
 */
omitStack?: boolean;
}
/**
 * Inputs for computing a request's `uniqueKey`
 * (see {@link Request.computeUniqueKey}). File-private: not exported.
 */
interface ComputeUniqueKeyOptions {
/** URL to normalize into the unique key. */
url: string;
/** HTTP method; only included in the key when `useExtendedUniqueKey` is set. */
method: AllowedHttpMethods;
/** Payload to hash into the key when `useExtendedUniqueKey` is set. */
payload?: string | Buffer;
/** Keep the URL hash fragment in the key (see {@link RequestOptions.keepUrlFragment}). */
keepUrlFragment?: boolean;
/** Include method and payload hash in the key (see {@link RequestOptions.useExtendedUniqueKey}). */
useExtendedUniqueKey?: boolean;
}
/**
 * Anything accepted as a source of requests: either partial {@link RequestOptions}
 * (optionally augmented with `requestsFromUrl`/`regex`) or an already constructed
 * {@link Request} instance.
 */
export type Source = (Partial<RequestOptions> & {
/** URL of a remote resource to load request URLs from — TODO(review): confirm expected resource format against RequestList docs. */
requestsFromUrl?: string;
/** Regular expression applied to the fetched resource — presumably to extract URLs; verify against the consumer. */
regex?: RegExp;
}) | Request;
/**
 * A {@link Source} whose requests are definitely loaded from a remote URL
 * (`requestsFromUrl` is required here, unlike in `Source`).
 * @internal
 */
export interface InternalSource {
requestsFromUrl: string;
regex?: RegExp;
}
export {};
//# sourceMappingURL=request.d.ts.map