@crawlee/http
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. It enables development of data extraction and web automation jobs — not only with headless Chrome and Puppeteer.
531 lines • 30 kB
TypeScript
import type { IncomingMessage } from 'node:http';
import type { Readable } from 'node:stream';
import type { BasicCrawlerOptions, CrawlingContext, ErrorHandler, GetUserDataFromRequest, ProxyConfiguration, Request, RequestHandler, RouterRoutes, Session } from '@crawlee/basic';
import { BasicCrawler, Configuration, CrawlerExtension } from '@crawlee/basic';
import type { HttpResponse } from '@crawlee/core';
import type { Awaitable, Dictionary } from '@crawlee/types';
import { type CheerioRoot } from '@crawlee/utils';
import type { RequestLike, ResponseLike } from 'content-type';
// @ts-ignore optional peer dependency or compatibility with es2022
import type { Method, OptionsInit } from 'got-scraping';
import { ObjectPredicate } from 'ow';
import type { JsonValue } from 'type-fest';
/**
 * Shape of the response object exposed on the crawling context: the crawlee
 * {@link HttpResponse} (minus its typed `body`) intersected with Node's
 * `IncomingMessage`, with `body` relaxed to `unknown` since parsing/encoding
 * is applied separately by the crawler.
 *
 * TODO exists for BC within HttpCrawler - replace completely with StreamingHttpResponse in 4.0
 * @internal
 */
export type PlainResponse = Omit<HttpResponse, 'body'> & IncomingMessage & {
    // Populated after the crawler downloads and decodes the payload; consumers
    // should prefer the typed `body` on the crawling context itself.
    body?: unknown;
};
/**
 * Signature of the `errorHandler` / `failedRequestHandler` callbacks of {@link HttpCrawler},
 * typed over the {@link HttpCrawlingContext}.
 */
export type HttpErrorHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
JSONData extends JsonValue = any> = ErrorHandler<HttpCrawlingContext<UserData, JSONData>>;
export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext> extends BasicCrawlerOptions<Context> {
    /**
     * An alias for {@link HttpCrawlerOptions.requestHandler}
     * Soon to be removed, use `requestHandler` instead.
     * @deprecated
     */
    handlePageFunction?: HttpCrawlerOptions<Context>['requestHandler'];
    /**
     * Timeout in which the HTTP request to the resource needs to finish, given in seconds.
     */
    navigationTimeoutSecs?: number;
    /**
     * If set to true, SSL certificate errors will be ignored.
     */
    ignoreSslErrors?: boolean;
    /**
     * If set, this crawler will be configured for all connections to use
     * [Apify Proxy](https://console.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration.
     * For more information, see the [documentation](https://docs.apify.com/proxy).
     */
    proxyConfiguration?: ProxyConfiguration;
    /**
     * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies
     * or browser properties before navigation. The function accepts two parameters, `crawlingContext` and `gotOptions`,
     * which are passed to the `requestAsBrowser()` function the crawler calls to navigate.
     * Example:
     * ```
     * preNavigationHooks: [
     *     async (crawlingContext, gotOptions) => {
     *         // ...
     *     },
     * ]
     * ```
     *
     * Modifying `pageOptions` is supported only in Playwright incognito.
     * See {@link PrePageCreateHook}
     */
    preNavigationHooks?: InternalHttpHook<Context>[];
    /**
     * Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful.
     * The function accepts `crawlingContext` as the only parameter.
     * Example:
     * ```
     * postNavigationHooks: [
     *     async (crawlingContext) => {
     *         // ...
     *     },
     * ]
     * ```
     */
    postNavigationHooks?: InternalHttpHook<Context>[];
    /**
     * An array of [MIME types](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types)
     * you want the crawler to load and process. By default, only `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
     * and `application/json` MIME types are supported.
     */
    additionalMimeTypes?: string[];
    /**
     * By default this crawler will extract correct encoding from the HTTP response headers.
     * Sadly, there are some websites which use invalid headers. Those are decoded using the UTF-8 encoding.
     * If those sites actually use a different encoding, the response will be corrupted. You can use
     * `suggestResponseEncoding` to fall back to a certain encoding, if you know that your target website uses it.
     * To force a certain encoding, disregarding the response headers, use {@link HttpCrawlerOptions.forceResponseEncoding}
     * ```
     * // Will fall back to windows-1250 encoding if none found
     * suggestResponseEncoding: 'windows-1250'
     * ```
     */
    suggestResponseEncoding?: string;
    /**
     * By default this crawler will extract correct encoding from the HTTP response headers. Use `forceResponseEncoding`
     * to force a certain encoding, disregarding the response headers.
     * To only provide a default for missing encodings, use {@link HttpCrawlerOptions.suggestResponseEncoding}
     * ```
     * // Will force windows-1250 encoding even if headers say otherwise
     * forceResponseEncoding: 'windows-1250'
     * ```
     */
    forceResponseEncoding?: string;
    /**
     * Automatically saves cookies to Session. Works only if Session Pool is used.
     *
     * It parses cookies from the response `set-cookie` header and saves or updates them on the session.
     * Once the session is reused for a subsequent request, the stored cookies are sent along via the
     * `Cookie` request header.
     */
    persistCookiesPerSession?: boolean;
    /**
     * An array of HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be excluded from error consideration.
     * By default, status codes >= 500 trigger errors.
     */
    ignoreHttpErrorStatusCodes?: number[];
    /**
     * An array of additional HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be treated as errors.
     * By default, status codes >= 500 trigger errors.
     */
    additionalHttpErrorStatusCodes?: number[];
}
/**
 * A pre/post-navigation hook: receives the crawling context and the mutable
 * `got-scraping` options for the upcoming (or just finished) request.
 * @internal
 */
export type InternalHttpHook<Context> = (crawlingContext: Context, gotOptions: OptionsInit) => Awaitable<void>;
/**
 * Public hook type for {@link HttpCrawlerOptions.preNavigationHooks} and
 * {@link HttpCrawlerOptions.postNavigationHooks}, typed over the {@link HttpCrawlingContext}.
 */
export type HttpHook<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
JSONData extends JsonValue = any> = InternalHttpHook<HttpCrawlingContext<UserData, JSONData>>;
/**
 * Base crawling-context shape shared by {@link HttpCrawlingContext} and the
 * contexts of crawlers that extend {@link HttpCrawler}.
 * @internal
 */
export interface InternalHttpCrawlingContext<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
JSONData extends JsonValue = any, // with default to Dictionary we cant use a typed router in untyped crawler
Crawler = HttpCrawler<any>> extends CrawlingContext<Crawler, UserData> {
    /**
     * The body of the HTTP response (the downloaded web page).
     * The type depends on the `Content-Type` header of the web page:
     * - String for `text/html`, `application/xhtml+xml`, `application/xml` MIME content types
     * - Buffer for other MIME content types
     */
    body: string | Buffer;
    /**
     * The parsed object from JSON string if the response contains the content type application/json.
     */
    json: JSONData;
    /**
     * Parsed `Content-Type header: { type, encoding }`.
     */
    contentType: {
        type: string;
        encoding: BufferEncoding;
    };
    // The raw response; see PlainResponse for why `body` is untyped there.
    response: PlainResponse;
    /**
     * Wait for an element matching the selector to appear. Timeout is ignored
     * (the HTTP response is static — the element either is or is not present).
     *
     * **Example usage:**
     * ```ts
     * async requestHandler({ waitForSelector, parseWithCheerio }) {
     *     await waitForSelector('article h1');
     *     const $ = await parseWithCheerio();
     *     const title = $('title').text();
     * });
     * ```
     */
    waitForSelector(selector: string, timeoutMs?: number): Promise<void>;
    /**
     * Returns Cheerio handle for `page.content()`, allowing to work with the data same way as with {@link CheerioCrawler}.
     * When provided with the `selector` argument, it will throw if it's not available.
     *
     * **Example usage:**
     * ```ts
     * async requestHandler({ parseWithCheerio }) {
     *     const $ = await parseWithCheerio();
     *     const title = $('title').text();
     * });
     * ```
     */
    parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
}
/**
 * The crawling context passed to the {@link HttpCrawler} request handler and hooks.
 */
export interface HttpCrawlingContext<UserData extends Dictionary = any, JSONData extends JsonValue = any> extends InternalHttpCrawlingContext<UserData, JSONData, HttpCrawler<HttpCrawlingContext<UserData, JSONData>>> {
}
/**
 * Signature of the `requestHandler` callback of {@link HttpCrawler},
 * typed over the {@link HttpCrawlingContext}.
 */
export type HttpRequestHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
JSONData extends JsonValue = any> = RequestHandler<HttpCrawlingContext<UserData, JSONData>>;
/**
 * Provides a framework for the parallel crawling of web pages using plain HTTP requests.
 * The URLs to crawl are fed either from a static list of URLs
 * or from a dynamic queue of URLs enabling recursive crawling of websites.
 *
 * It is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
 * to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
 * because those load the pages using a full-featured headless Chrome browser.
 *
 * This crawler downloads each URL using a plain HTTP request and doesn't do any HTML parsing.
 *
 * The source URLs are represented using {@link Request} objects that are fed from
 * {@link RequestList} or {@link RequestQueue} instances provided by the {@link HttpCrawlerOptions.requestList}
 * or {@link HttpCrawlerOptions.requestQueue} constructor options, respectively.
 *
 * If both {@link HttpCrawlerOptions.requestList} and {@link HttpCrawlerOptions.requestQueue} are used,
 * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
 * to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
 *
 * The crawler finishes when there are no more {@link Request} objects to crawl.
 *
 * We can use the `preNavigationHooks` to adjust `gotOptions`:
 *
 * ```javascript
 * preNavigationHooks: [
 *     (crawlingContext, gotOptions) => {
 *         // ...
 *     },
 * ]
 * ```
 *
 * By default, this crawler only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
 * and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
 * and skips pages with other content types. If you want the crawler to process other content types,
 * use the {@link HttpCrawlerOptions.additionalMimeTypes} constructor option.
 * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
 * For details, see {@link HttpCrawlerOptions.requestHandler}.
 *
 * New requests are only dispatched when there is enough free CPU and memory available,
 * using the functionality provided by the {@link AutoscaledPool} class.
 * All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
 * parameter of the constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
 * {@link AutoscaledPool} options are available directly in the constructor.
 *
 * **Example usage:**
 *
 * ```javascript
 * import { HttpCrawler, Dataset } from '@crawlee/http';
 *
 * const crawler = new HttpCrawler({
 *     requestList,
 *     async requestHandler({ request, response, body, contentType }) {
 *         // Save the data to dataset.
 *         await Dataset.pushData({
 *             url: request.url,
 *             html: body,
 *         });
 *     },
 * });
 *
 * await crawler.run([
 *     'http://www.example.com/page-1',
 *     'http://www.example.com/page-2',
 * ]);
 * ```
 * @category Crawlers
 */
export declare class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, HttpCrawler<Context>>> extends BasicCrawler<Context> {
    /** The {@link Configuration} instance this crawler was constructed with. */
    readonly config: Configuration;
    /**
     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
     * Only available if used by the crawler.
     */
    proxyConfiguration?: ProxyConfiguration;
    /** Timeout for the user-supplied `requestHandler`, in milliseconds. */
    protected userRequestHandlerTimeoutMillis: number;
    /** Hooks run before each HTTP request; see {@link HttpCrawlerOptions.preNavigationHooks}. */
    protected preNavigationHooks: InternalHttpHook<Context>[];
    /** Hooks run after each HTTP request; see {@link HttpCrawlerOptions.postNavigationHooks}. */
    protected postNavigationHooks: InternalHttpHook<Context>[];
    /** Whether `set-cookie` response headers are persisted on the session. */
    protected persistCookiesPerSession: boolean;
    /** `navigationTimeoutSecs` converted to milliseconds. */
    protected navigationTimeoutMillis: number;
    /** Whether SSL certificate errors are ignored; see {@link HttpCrawlerOptions.ignoreSslErrors}. */
    protected ignoreSslErrors: boolean;
    /** Fallback response encoding; see {@link HttpCrawlerOptions.suggestResponseEncoding}. */
    protected suggestResponseEncoding?: string;
    /** Forced response encoding; see {@link HttpCrawlerOptions.forceResponseEncoding}. */
    protected forceResponseEncoding?: string;
    /** Extra status codes treated as errors, from {@link HttpCrawlerOptions.additionalHttpErrorStatusCodes}. */
    protected additionalHttpErrorStatusCodes: Set<number>;
    /** Status codes excluded from error consideration, from {@link HttpCrawlerOptions.ignoreHttpErrorStatusCodes}. */
    protected ignoreHttpErrorStatusCodes: Set<number>;
    /** MIME types the crawler will download and parse; extended via `additionalMimeTypes`. */
    protected readonly supportedMimeTypes: Set<string>;
    /** `ow` validation shape used to validate the constructor options object. */
    protected static optionsShape: {
        // @ts-ignore optional peer dependency or compatibility with es2022
        handlePageFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        navigationTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        ignoreSslErrors: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        additionalMimeTypes: import("ow").ArrayPredicate<string>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        suggestResponseEncoding: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        forceResponseEncoding: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        proxyConfiguration: ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        persistCookiesPerSession: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        additionalHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        ignoreHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        preNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        postNavigationHooks: import("ow").ArrayPredicate<unknown> & import("ow").BasePredicate<unknown[] | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestQueue: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handleRequestFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        requestHandlerTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handleRequestTimeoutSecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        errorHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        failedRequestHandler: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        handleFailedRequestFunction: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestRetries: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        sameDomainDelaySecs: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        statusMessageCallback: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
    };
    /**
     * All `HttpCrawlerOptions` parameters are passed via an options object.
     */
    constructor(options?: HttpCrawlerOptions<Context>, config?: Configuration);
    /**
     * **EXPERIMENTAL**
     * Function for attaching CrawlerExtensions such as the Unblockers.
     * @param extension Crawler extension that overrides the crawler configuration.
     */
    use(extension: CrawlerExtension): void;
    /**
     * Wrapper around requestHandler that opens and closes pages etc.
     */
    protected _runRequestHandler(crawlingContext: Context): Promise<void>;
    /**
     * Resolves to a blocking reason (e.g. an anti-bot challenge description) or `false`
     * when the response is not considered blocked.
     */
    protected isRequestBlocked(crawlingContext: Context): Promise<string | false>;
    /**
     * Runs pre-navigation hooks, performs the HTTP request and runs post-navigation hooks.
     */
    protected _handleNavigation(crawlingContext: Context): Promise<void>;
    /**
     * Sets the cookie header to `gotOptions` based on the provided request and session headers, as well as any changes that occurred due to hooks.
     */
    protected _applyCookies({ session, request }: CrawlingContext, gotOptions: OptionsInit, preHookCookies: string, postHookCookies: string): void;
    /**
     * Function to make the HTTP request. It performs optimizations
     * on the request such as only downloading the request body if the
     * received content type matches text/html, application/xml, application/xhtml+xml.
     */
    protected _requestFunction({ request, session, proxyUrl, gotOptions, }: RequestFunctionOptions): Promise<PlainResponse>;
    /**
     * Encodes and parses response according to the provided content type
     */
    protected _parseResponse(request: Request, responseStream: IncomingMessage, crawlingContext: Context): Promise<(Partial<Context> & {
        isXml: boolean;
        response: IncomingMessage;
        contentType: {
            type: string;
            encoding: BufferEncoding;
        };
    }) | {
        body: Buffer<ArrayBufferLike>;
        response: IncomingMessage;
        contentType: {
            type: string;
            encoding: BufferEncoding;
        };
        enqueueLinks: () => Promise<{
            processedRequests: never[];
            unprocessedRequests: never[];
        }>;
    }>;
    /**
     * Parses an HTML (or XML) response stream into context fields; overridden by subclasses
     * such as CheerioCrawler to provide richer parsing.
     */
    protected _parseHTML(response: IncomingMessage, _isXml: boolean, _crawlingContext: Context): Promise<Partial<Context>>;
    /**
     * Combines the provided `requestOptions` with mandatory (non-overridable) values.
     */
    protected _getRequestOptions(request: Request, session?: Session, proxyUrl?: string, gotOptions?: OptionsInit): {
        // @ts-ignore optional peer dependency or compatibility with es2022
        body?: string | Buffer | Readable | Generator | AsyncGenerator | Iterable<unknown> | AsyncIterable<unknown> | import("got-scraping/node_modules/form-data-encoder", { with: { "resolution-mode": "import" } }).FormDataLike | ArrayBufferView | undefined;
        json?: unknown;
        // @ts-ignore optional peer dependency or compatibility with es2022
        request?: import("got-scraping", { with: { "resolution-mode": "import" } }).RequestFunction | undefined;
        url?: string | URL | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        headers?: import("got-scraping", { with: { "resolution-mode": "import" } }).Headers | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        agent?: import("got-scraping", { with: { "resolution-mode": "import" } }).Agents | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        h2session?: import("http2").ClientHttp2Session | undefined;
        decompress?: boolean | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        timeout?: import("got-scraping", { with: { "resolution-mode": "import" } }).Delays | undefined;
        prefixUrl?: string | URL | undefined;
        form?: Record<string, any> | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        cookieJar?: import("got-scraping", { with: { "resolution-mode": "import" } }).PromiseCookieJar | import("got-scraping", { with: { "resolution-mode": "import" } }).ToughCookieJar | undefined;
        signal?: AbortSignal | undefined;
        ignoreInvalidCookies?: boolean | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        searchParams?: string | import("got-scraping", { with: { "resolution-mode": "import" } }).SearchParameters | URLSearchParams | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        dnsLookup?: import("cacheable-lookup", { with: { "resolution-mode": "import" } }).default["lookup"] | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        dnsCache?: import("cacheable-lookup", { with: { "resolution-mode": "import" } }).default | boolean | undefined;
        context?: Record<string, unknown> | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        followRedirect?: boolean | ((response: import("got-scraping", { with: { "resolution-mode": "import" } }).PlainResponse) => boolean) | undefined;
        maxRedirects?: number | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        cache?: string | (import("got-scraping/node_modules/keyv", { with: { "resolution-mode": "import" } }).KeyvStoreAdapter | import("got-scraping/node_modules/keyv", { with: { "resolution-mode": "import" } }).Keyv<any> | Map<any, any>) | boolean | undefined;
        throwHttpErrors?: boolean | undefined;
        username?: string | undefined;
        password?: string | undefined;
        http2?: boolean | undefined;
        allowGetBody?: boolean | undefined;
        copyPipedHeaders?: boolean | undefined;
        methodRewriting?: boolean | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        dnsLookupIpVersion?: import("got-scraping", { with: { "resolution-mode": "import" } }).DnsLookupIpVersion;
        // @ts-ignore optional peer dependency or compatibility with es2022
        parseJson?: import("got-scraping", { with: { "resolution-mode": "import" } }).ParseJsonFunction | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        stringifyJson?: import("got-scraping", { with: { "resolution-mode": "import" } }).StringifyJsonFunction | undefined;
        localAddress?: string | undefined;
        method?: Method | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        createConnection?: import("got-scraping", { with: { "resolution-mode": "import" } }).CreateConnectionFunction | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        cacheOptions?: import("got-scraping", { with: { "resolution-mode": "import" } }).CacheOptions | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        https?: import("got-scraping", { with: { "resolution-mode": "import" } }).HttpsOptions | undefined;
        encoding?: BufferEncoding | undefined;
        resolveBodyOnly?: boolean | undefined;
        isStream?: boolean | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        responseType?: import("got-scraping", { with: { "resolution-mode": "import" } }).ResponseType | undefined;
        // @ts-ignore optional peer dependency or compatibility with es2022
        pagination?: import("got-scraping", { with: { "resolution-mode": "import" } }).PaginationOptions<unknown, unknown> | undefined;
        setHost?: boolean | undefined;
        maxHeaderSize?: number | undefined;
        enableUnixSockets?: boolean | undefined;
        strictContentLength?: boolean | undefined;
    } & {
        // @ts-ignore optional peer dependency or compatibility with es2022
        hooks?: Partial<import("got-scraping", { with: { "resolution-mode": "import" } }).Hooks>;
        // @ts-ignore optional peer dependency or compatibility with es2022
        retry?: Partial<import("got-scraping", { with: { "resolution-mode": "import" } }).RetryOptions>;
        preserveHooks?: boolean;
        // @ts-ignore optional peer dependency or compatibility with es2022
    } & import("got-scraping", { with: { "resolution-mode": "import" } }).Context & Required<Pick<OptionsInit, "url">> & {
        isStream: true;
    };
    /**
     * Resolves the effective response encoding, taking `forceResponseEncoding` /
     * `suggestResponseEncoding` into account (see the corresponding constructor options).
     */
    protected _encodeResponse(request: Request, response: IncomingMessage, encoding: BufferEncoding): {
        encoding: BufferEncoding;
        response: IncomingMessage;
    };
    /**
     * Checks and extends supported mime types
     */
    protected _extendSupportedMimeTypes(additionalMimeTypes: (string | RequestLike | ResponseLike)[]): void;
    /**
     * Handles timeout request
     */
    protected _handleRequestTimeout(session?: Session): void;
    // Aborts the in-flight download of a response body (e.g. on unsupported content type).
    private _abortDownloadOfBody;
    /**
     * @internal wraps public utility for mocking purposes
     */
    private _requestAsBrowser;
}
/** Options bag consumed by `HttpCrawler._requestFunction`. */
interface RequestFunctionOptions {
    // The crawlee request being navigated.
    request: Request;
    // Session whose cookies/headers may be applied to the request.
    session?: Session;
    // Proxy URL to route the request through, if any.
    proxyUrl?: string;
    // Mutable got-scraping options, already processed by pre-navigation hooks.
    gotOptions: OptionsInit;
}
/**
 * Creates new {@link Router} instance that works based on request labels.
 * This instance can then serve as a `requestHandler` of your {@link HttpCrawler}.
 * Defaults to the {@link HttpCrawlingContext}.
 *
 * > Serves as a shortcut for using `Router.create<HttpCrawlingContext>()`.
 *
 * ```ts
 * import { HttpCrawler, createHttpRouter } from 'crawlee';
 *
 * const router = createHttpRouter();
 * router.addHandler('label-a', async (ctx) => {
 *     ctx.log.info('...');
 * });
 * router.addDefaultHandler(async (ctx) => {
 *     ctx.log.info('...');
 * });
 *
 * const crawler = new HttpCrawler({
 *     requestHandler: router,
 * });
 * await crawler.run();
 * ```
 */
// @ts-ignore optional peer dependency or compatibility with es2022
export declare function createHttpRouter<Context extends HttpCrawlingContext = HttpCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): import("@crawlee/basic").RouterHandler<Context>;
export {};
//# sourceMappingURL=http-crawler.d.ts.map