@crawlee/http
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. It enables development of data extraction and web automation jobs — not only with headless Chrome and Puppeteer.
531 lines • 30 kB
TypeScript
import type { IncomingMessage } from 'node:http';
import type { Readable } from 'node:stream';
import type { BasicCrawlerOptions, CrawlingContext, ErrorHandler, GetUserDataFromRequest, ProxyConfiguration, Request, RequestHandler, RouterRoutes, Session } from '@crawlee/basic';
import { BasicCrawler, Configuration, CrawlerExtension } from '@crawlee/basic';
import type { HttpResponse } from '@crawlee/core';
import type { Awaitable, Dictionary } from '@crawlee/types';
import { type CheerioRoot } from '@crawlee/utils';
import type { RequestLike, ResponseLike } from 'content-type';
// @ts-ignore optional peer dependency or compatibility with es2022
import type { Method, OptionsInit } from 'got-scraping';
import { ObjectPredicate } from 'ow';
import type { JsonValue } from 'type-fest';
/**
 * Shape of the response object exposed on the crawling context: the crawlee
 * {@link HttpResponse} (minus its typed `body`) intersected with Node's
 * `IncomingMessage`, with `body` relaxed to `unknown` since parsing/encoding
 * is applied separately by the crawler.
 *
 * TODO exists for BC within HttpCrawler - replace completely with StreamingHttpResponse in 4.0
 * @internal
 */
export type PlainResponse = Omit<HttpResponse, 'body'> & IncomingMessage & {
    // Populated after the crawler downloads and decodes the payload; consumers
    // should prefer the typed `body` on the crawling context itself.
    body?: unknown;
};
/**
 * Signature of the `errorHandler` / `failedRequestHandler` callbacks of {@link HttpCrawler},
 * typed over the {@link HttpCrawlingContext}.
 */
export type HttpErrorHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
JSONData extends JsonValue = any> = ErrorHandler<HttpCrawlingContext<UserData, JSONData>>;
export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext> extends BasicCrawlerOptions<Context> {
    /**
     * An alias for {@link HttpCrawlerOptions.requestHandler}
     * Soon to be removed, use `requestHandler` instead.
     * @deprecated
     */
    handlePageFunction?: HttpCrawlerOptions<Context>['requestHandler'];
    /**
     * Timeout in which the HTTP request to the resource needs to finish, given in seconds.
     */
    navigationTimeoutSecs?: number;
    /**
     * If set to true, SSL certificate errors will be ignored.
     */
    ignoreSslErrors?: boolean;
    /**
     * If set, this crawler will be configured for all connections to use
     * [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
     * For more information, see the [documentation](https://docs.apify.com/proxy).
     */
    proxyConfiguration?: ProxyConfiguration;
    /**
     * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
     * or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotOptions`,
     * which are passed to the `requestAsBrowser()` function the crawler calls to navigate.
     * Example:
     * ```
     * preNavigationHooks: [
     *     async (crawlingContext, gotOptions) => {
     *         // ...
     *     },
     * ]
     * ```
     *
     * Modifying `pageOptions` is supported only in Playwright incognito.
     * See {@link PrePageCreateHook}
     */
    preNavigationHooks?: InternalHttpHook<Context>[];
    /**
     * Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
     * The function accepts `crawlingContext` as the only parameter.
     * Example:
     * ```
     * postNavigationHooks: [
     *     async (crawlingContext) => {
     *         // ...
     *     },
     * ]
     * ```
     */
    postNavigationHooks?: InternalHttpHook<Context>[];
    /**
     * An array of [MIME types](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types)
     * you want the crawler to load and process. By default, only `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
     * and `application/json` MIME types are supported.
     */
    additionalMimeTypes?: string[];
    /**
     * By default this crawler will extract correct encoding from the HTTP response headers.
     * Sadly, there are some websites which use invalid headers. Those are decoded using the UTF-8 encoding.
     * If those sites actually use a different encoding, the response will be corrupted. You can use
     * `suggestResponseEncoding` to fall back to a certain encoding, if you know that your target website uses it.
     * To force a certain encoding, disregarding the response headers, use {@link HttpCrawlerOptions.forceResponseEncoding}
     * ```
     * // Will fall back to windows-1250 encoding if none found
     * suggestResponseEncoding: 'windows-1250'
     * ```
     */
    suggestResponseEncoding?: string;
    /**
     * By default this crawler will extract correct encoding from the HTTP response headers. Use `forceResponseEncoding`
     * to force a certain encoding, disregarding the response headers.
     * To only provide a default for missing encodings, use {@link HttpCrawlerOptions.suggestResponseEncoding}
     * ```
     * // Will force windows-1250 encoding even if headers say otherwise
     * forceResponseEncoding: 'windows-1250'
     * ```
     */
    forceResponseEncoding?: string;
    /**
     * Automatically saves cookies to Session. Works only if Session Pool is used.
     *
     * It parses cookies from the response `set-cookie` header and saves or updates them on the session.
     * Once the session is reused for a subsequent request, the stored cookies are sent along via the
     * `Cookie` request header.
     */
    persistCookiesPerSession?: boolean;
    /**
     * An array of HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be excluded from error consideration.
     * By default, status codes >= 500 trigger errors.
     */
    ignoreHttpErrorStatusCodes?: number[];
    /**
     * An array of additional HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be treated as errors.
     * By default, status codes >= 500 trigger errors.
     */
    additionalHttpErrorStatusCodes?: number[];
}
/**
 * A pre/post-navigation hook: receives the crawling context and the mutable
 * `got-scraping` options for the upcoming (or just finished) request.
 * @internal
 */
export type InternalHttpHook<Context> = (crawlingContext: Context, gotOptions: OptionsInit) => Awaitable<void>;
/**
 * Public hook type for {@link HttpCrawlerOptions.preNavigationHooks} and
 * {@link HttpCrawlerOptions.postNavigationHooks}, typed over the {@link HttpCrawlingContext}.
 */
export type HttpHook<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
JSONData extends JsonValue = any> = InternalHttpHook<HttpCrawlingContext<UserData, JSONData>>;
/**
 * Base crawling-context shape shared by {@link HttpCrawlingContext} and the
 * contexts of crawlers that extend {@link HttpCrawler}.
 * @internal
 */
export interface InternalHttpCrawlingContext<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
JSONData extends JsonValue = any, // with default to Dictionary we cant use a typed router in untyped crawler
Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
    /**
     * The body of the HTTP response (the downloaded web page).
     * The type depends on the `Content-Type` header of the web page:
     * - String for `text/html`, `application/xhtml+xml`, `application/xml` MIME content types
     * - Buffer for other MIME content types
     */
    body: string | Buffer;
    /**
     * The parsed object from JSON string if the response contains the content type application/json.
     */
    json: JSONData;
    /**
     * Parsed `Content-Type header: { type, encoding }`.
     */
    contentType: {
        type: string;
        encoding: BufferEncoding;
    };
    // The raw response; see PlainResponse for why `body` is untyped there.
    response: PlainResponse;
    /**
     * Wait for an element matching the selector to appear. Timeout is ignored
     * (the HTTP response is static — the element either is or is not present).
     *
     * **Example usage:**
     * ```ts
     * async requestHandler({ waitForSelector, parseWithCheerio }) {
     *     await waitForSelector('article h1');
     *     const $ = await parseWithCheerio();
     *     const title = $('title').text();
     * });
     * ```
     */
    waitForSelector(selector: string, timeoutMs?: number): Promise<void>;
    /**
     * Returns Cheerio handle for `page.content()`, allowing to work with the data same way as with {@link CheerioCrawler}.
     * When provided with the `selector` argument, it will throw if it's not available.
     *
     * **Example usage:**
     * ```ts
     * async requestHandler({ parseWithCheerio }) {
     *     const $ = await parseWithCheerio();
     *     const title = $('title').text();
     * });
     * ```
     */
    parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
}
/**
 * The crawling context passed to the {@link HttpCrawler} request handler and hooks.
 */
export interface HttpCrawlingContext<UserData extends Dictionary = any, JSONData extends JsonValue = any> extends InternalHttpCrawlingContext<UserData, JSONData, HttpCrawler<HttpCrawlingContext<UserData, JSONData>>> {
}
/**
 * Signature of the `requestHandler` callback of {@link HttpCrawler},
 * typed over the {@link HttpCrawlingContext}.
 */
export type HttpRequestHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData, JSONData>>;
/**
 * Provides a framework for the parallel crawling of web pages using plain HTTP requests.
 * The URLs to crawl are fed either from a static list of URLs
 * or from a dynamic queue of URLs enabling recursive crawling of websites.
 *
 * It is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
 * to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
 * because those load the pages using a full-featured headless Chrome browser.
 *
 * This crawler downloads each URL using a plain HTTP request and doesn't do any HTML parsing.
 *
 * The source URLs are represented using {@link Request} objects that are fed from
 * {@link RequestList} or {@link RequestQueue} instances provided by the {@link HttpCrawlerOptions.requestList}
 * or {@link HttpCrawlerOptions.requestQueue} constructor options, respectively.
 *
 * If both {@link HttpCrawlerOptions.requestList} and {@link HttpCrawlerOptions.requestQueue} are used,
 * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
 * to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
 *
 * The crawler finishes when there are no more {@link Request} objects to crawl.
 *
 * We can use the `preNavigationHooks` to adjust `gotOptions`:
 *
 * ```javascript
 * preNavigationHooks: [
 *     (crawlingContext, gotOptions) => {
 *         // ...
 *     },
 * ]
 * ```
 *
 * By default, this crawler only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
 * and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
 * and skips pages with other content types. If you want the crawler to process other content types,
 * use the {@link HttpCrawlerOptions.additionalMimeTypes} constructor option.
 * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
 * For details, see {@link HttpCrawlerOptions.requestHandler}.
 *
 * New requests are only dispatched when there is enough free CPU and memory available,
 * using the functionality provided by the {@link AutoscaledPool} class.
 * All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
 * parameter of the constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
 * {@link AutoscaledPool} options are available directly in the constructor.
 *
 * **Example usage:**
 *
 * ```javascript
 * import { HttpCrawler, Dataset } from '@crawlee/http';
 *
 * const crawler = new HttpCrawler({
 *     requestList,
 *     async requestHandler({ request, response, body, contentType }) {
 *         // Save the data to dataset.
 *         await Dataset.pushData({
 *             url: request.url,
 *             html: body,
 *         });
 *     },
 * });
 *
 * await crawler.run([
 *     'http://www.example.com/page-1',
 *     'http://www.example.com/page-2',
 * ]);
 * ```
 * @category Crawlers
 */
export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, HttpCrawler<Context>>> extends BasicCrawler<Context> {
    /** The {@link Configuration} instance this crawler was constructed with. */
    readonly config: Configuration;
    /**
     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
     * Only available if used by the crawler.
     */
    proxyConfiguration?: ProxyConfiguration;
    /** Timeout for the user-supplied `requestHandler`, in milliseconds. */
    protected userRequestHandlerTimeoutMillis: number;
    /** Hooks run before each HTTP request; see {@link HttpCrawlerOptions.preNavigationHooks}. */
    protected preNavigationHooks: InternalHttpHook<Context>[];
    /** Hooks run after each HTTP request; see {@link HttpCrawlerOptions.postNavigationHooks}. */
    protected postNavigationHooks: InternalHttpHook<Context>[];
    /** Whether `set-cookie` response headers are persisted on the session. */
    protected persistCookiesPerSession: boolean;
    /** `navigationTimeoutSecs` converted to milliseconds. */
    protected navigationTimeoutMillis: number;
    /** Whether SSL certificate errors are ignored; see {@link HttpCrawlerOptions.ignoreSslErrors}. */
    protected ignoreSslErrors: boolean;
    /** Fallback response encoding; see {@link HttpCrawlerOptions.suggestResponseEncoding}. */
    protected suggestResponseEncoding?: string;
    /** Forced response encoding; see {@link HttpCrawlerOptions.forceResponseEncoding}. */
    protected forceResponseEncoding?: string;
    /** Extra status codes treated as errors, from {@link HttpCrawlerOptions.additionalHttpErrorStatusCodes}. */
    protected additionalHttpErrorStatusCodes: Set<number>;
    /** Status codes excluded from error consideration, from {@link HttpCrawlerOptions.ignoreHttpErrorStatusCodes}. */
    protected ignoreHttpErrorStatusCodes: Set<number>;
    /** MIME types the crawler will download and parse; extended via `additionalMimeTypes`. */
    protected readonly supportedMimeTypes: Set<string>;
    /** `ow` validation shape used to validate the constructor options object. */
    protected static optionsShape: {
        // @ts-ignore optional peer dependency or compatibility with es2022
        handlePageFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        navigationTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        ignoreSslErrors: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        additionalMimeTypes: import("ow").ArrayPredicate<string>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        suggestResponseEncoding: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        forceResponseEncoding: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        proxyConfiguration: ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        persistCookiesPerSession: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        additionalHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        ignoreHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        preNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        postNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestQueue: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handleRequestFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestHandlerTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handleRequestTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        errorHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        failedRequestHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handleFailedRequestFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestRetries: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        sameDomainDelaySecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        statusMessageCallback: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
    };
    /**
     * All `HttpCrawlerOptions` parameters are passed via an options object.
     */
    constructor(options?: HttpCrawlerOptions<Context>, config?: Configuration);
    /**
     * **EXPERIMENTAL**
     * Function for attaching CrawlerExtensions such as the Unblockers.
     * @param extension Crawler extension that overrides the crawler configuration.
     */
    use(extension: CrawlerExtension): void;
    /**
     * Wrapper around requestHandler that opens and closes pages etc.
     */
    protected _runRequestHandler(crawlingContext: Context): Promise<void>;
    /**
     * Resolves to a blocking reason (e.g. an anti-bot challenge description) or `false`
     * when the response is not considered blocked.
     */
    protected isRequestBlocked(crawlingContext: Context): Promise<string | false>;
    /**
     * Runs pre-navigation hooks, performs the HTTP request and runs post-navigation hooks.
     */
    protected _handleNavigation(crawlingContext: Context): Promise<void>;
    /**
     * Sets the cookie header to `gotOptions` based on the provided request and session headers, as well as any changes that occurred due to hooks.
     */
    protected _applyCookies({ session, request }: CrawlingContext, gotOptions: OptionsInit, preHookCookies: string, postHookCookies: string): void;
    /**
     * Function to make the HTTP request. It performs optimizations
     * on the request such as only downloading the request body if the
     * received content type matches text/html, application/xml, application/xhtml+xml.
     */
    protected _requestFunction({ request, session, proxyUrl, gotOptions, }: RequestFunctionOptions): Promise<PlainResponse>;
    /**
     * Encodes and parses response according to the provided content type
     */
    protected _parseResponse(request: Request, responseStream: IncomingMessage, crawlingContext: Context): Promise<(Partial<Context> & {
        isXml: boolean;
        response: IncomingMessage;
        contentType: {
            type: string;
            encoding: BufferEncoding;
        };
    }) | {
        body: Buffer<ArrayBufferLike>;
        response: IncomingMessage;
        contentType: {
            type: string;
            encoding: BufferEncoding;
        };
        enqueueLinks: () => Promise<{
            processedRequests: never[];
            unprocessedRequests: never[];
        }>;
    }>;
    /**
     * Parses an HTML (or XML) response stream into context fields; overridden by subclasses
     * such as CheerioCrawler to provide richer parsing.
     */
    protected _parseHTML(response: IncomingMessage, _isXml: boolean, _crawlingContext: Context): Promise<Partial<Context>>;
    /**
     * Combines the provided `requestOptions` with mandatory (non-overridable) values.
     */
    protected _getRequestOptions(request: Request, session?: Session, proxyUrl?: string, gotOptions?: OptionsInit): {
        // @ts-ignore optional peer dependency or compatibility with es2022
        body?: string | Buffer | Readable | Generator | AsyncGenerator | Iterable<unknown> | AsyncIterable<unknown> | import("got-scraping/node_modules/form-data-encoder", { with: { "resolution-mode": "import" } }).FormDataLike | ArrayBufferView | undefined;
        json?: unknown;
        // @ts-ignore optional peer dependency or compatibility with es2022
        request?: import("got-scraping", { with: { "resolution-mode": "import" } }).RequestFunction | undefined;
        url?: string | URL | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        headers?: import("got-scraping", { with: { "resolution-mode": "import" } }).Headers | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        agent?: import("got-scraping", { with: { "resolution-mode": "import" } }).Agents | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        h2session?: import("http2").ClientHttp2Session | undefined;
        decompress?: boolean | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        timeout?: import("got-scraping", { with: { "resolution-mode": "import" } }).Delays | undefined;
        prefixUrl?: string | URL | undefined;
        form?: Record<string, any> | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        cookieJar?: import("got-scraping", { with: { "resolution-mode": "import" } }).PromiseCookieJar | import("got-scraping", { with: { "resolution-mode": "import" } }).ToughCookieJar | undefined;
        signal?: AbortSignal | undefined;
        ignoreInvalidCookies?: boolean | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        searchParams?: string | import("got-scraping", { with: { "resolution-mode": "import" } }).SearchParameters | URLSearchParams | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        dnsLookup?: import("cacheable-lookup", { with: { "resolution-mode": "import" } }).default["lookup"] | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        dnsCache?: import("cacheable-lookup", { with: { "resolution-mode": "import" } }).default | boolean | undefined;
        context?: Record<string, unknown> | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        followRedirect?: boolean | ((response: import("got-scraping", { with: { "resolution-mode": "import" } }).PlainResponse) => boolean) | undefined;
        maxRedirects?: number | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        cache?: string | (import("got-scraping/node_modules/keyv", { with: { "resolution-mode": "import" } }).KeyvStoreAdapter | import("got-scraping/node_modules/keyv", { with: { "resolution-mode": "import" } }).Keyv<any> | Map<any, any>) | boolean | undefined;
        throwHttpErrors?: boolean | undefined;
        username?: string | undefined;
        password?: string | undefined;
        http2?: boolean | undefined;
        allowGetBody?: boolean | undefined;
        copyPipedHeaders?: boolean | undefined;
        methodRewriting?: boolean | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        dnsLookupIpVersion?: import("got-scraping", { with: { "resolution-mode": "import" } }).DnsLookupIpVersion;
        // @ts-ignore optional peer dependency or compatibility with es2022
        parseJson?: import("got-scraping", { with: { "resolution-mode": "import" } }).ParseJsonFunction | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        stringifyJson?: import("got-scraping", { with: { "resolution-mode": "import" } }).StringifyJsonFunction | undefined;
        localAddress?: string | undefined;
        method?: Method | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        createConnection?: import("got-scraping", { with: { "resolution-mode": "import" } }).CreateConnectionFunction | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        cacheOptions?: import("got-scraping", { with: { "resolution-mode": "import" } }).CacheOptions | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        https?: import("got-scraping", { with: { "resolution-mode": "import" } }).HttpsOptions | undefined;
        encoding?: BufferEncoding | undefined;
        resolveBodyOnly?: boolean | undefined;
        isStream?: boolean | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        responseType?: import("got-scraping", { with: { "resolution-mode": "import" } }).ResponseType | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        pagination?: import("got-scraping", { with: { "resolution-mode": "import" } }).PaginationOptions<unknown, unknown> | undefined;
        setHost?: boolean | undefined;
        maxHeaderSize?: number | undefined;
        enableUnixSockets?: boolean | undefined;
        strictContentLength?: boolean | undefined;
    } & {
        // @ts-ignore optional peer dependency or compatibility with es2022
        hooks?: Partial<import("got-scraping", { with: { "resolution-mode": "import" } }).Hooks>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        retry?: Partial<import("got-scraping", { with: { "resolution-mode": "import" } }).RetryOptions>;
        preserveHooks?: boolean;
        // @ts-ignore optional peer dependency or compatibility with es2022
    } & import("got-scraping", { with: { "resolution-mode": "import" } }).Context & Required<Pick<OptionsInit, "url">> & {
        isStream: true;
    };
    /**
     * Resolves the effective response encoding, taking `forceResponseEncoding` /
     * `suggestResponseEncoding` into account (see the corresponding constructor options).
     */
    protected _encodeResponse(request: Request, response: IncomingMessage, encoding: BufferEncoding): {
        encoding: BufferEncoding;
        response: IncomingMessage;
    };
    /**
     * Checks and extends supported mime types
     */
    protected _extendSupportedMimeTypes(additionalMimeTypes: (string | RequestLike | ResponseLike)[]): void;
    /**
     * Handles timeout request
     */
    protected _handleRequestTimeout(session?: Session): void;
    // Aborts the in-flight download of a response body (e.g. on unsupported content type).
    private _abortDownloadOfBody;
    /**
     * @internal wraps public utility for mocking purposes
     */
    private _requestAsBrowser;
}
/** Options bag consumed by `HttpCrawler._requestFunction`. */
interface RequestFunctionOptions {
    // The crawlee request being navigated.
    request: Request;
    // Session whose cookies/headers may be applied to the request.
    session?: Session;
    // Proxy URL to route the request through, if any.
    proxyUrl?: string;
    // Mutable got-scraping options, already processed by pre-navigation hooks.
    gotOptions: OptionsInit;
}
/**
 * Creates new {@link Router} instance that works based on request labels.
 * This instance can then serve as a `requestHandler` of your {@link HttpCrawler}.
 * Defaults to the {@link HttpCrawlingContext}.
 *
 * > Serves as a shortcut for using `Router.create<HttpCrawlingContext>()`.
 *
 * ```ts
 * import { HttpCrawler, createHttpRouter } from 'crawlee';
 *
 * const router = createHttpRouter();
 * router.addHandler('label-a', async (ctx) => {
 *     ctx.log.info('...');
 * });
 * router.addDefaultHandler(async (ctx) => {
 *     ctx.log.info('...');
 * });
 *
 * const crawler = new HttpCrawler({
 *     requestHandler: router,
 * });
 * await crawler.run();
 * ```
 */
// @ts-ignore optional peer dependency or compatibility with es2022
export declare function createHttpRouter<Context extends HttpCrawlingContext = HttpCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): import("@crawlee/basic").RouterHandler<Context>;
export {};
//# sourceMappingURL=http-crawler.d.ts.map