@crawlee/http
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
728 lines • 33 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.HttpCrawler = void 0;
exports.createHttpRouter = createHttpRouter;
const tslib_1 = require("tslib");
const node_path_1 = require("node:path");
const node_util_1 = tslib_1.__importDefault(require("node:util"));
const basic_1 = require("@crawlee/basic");
const utils_1 = require("@crawlee/utils");
const cheerio = tslib_1.__importStar(require("cheerio"));
const content_type_1 = tslib_1.__importDefault(require("content-type"));
const iconv_lite_1 = tslib_1.__importDefault(require("iconv-lite"));
const mime_types_1 = tslib_1.__importDefault(require("mime-types"));
const ow_1 = tslib_1.__importStar(require("ow"));
const timeout_1 = require("@apify/timeout");
const utilities_1 = require("@apify/utilities");
let TimeoutError;
/**
 * HTML and XML mime types that HttpCrawler accepts by default.
 */
const HTML_AND_XML_MIME_TYPES = [
    'text/html',
    'text/xml',
    'application/xhtml+xml',
    'application/xml',
];
/**
 * JSON mime type; accepted by default alongside the HTML/XML types.
 */
const APPLICATION_JSON_MIME_TYPE = 'application/json';
/**
 * Autoscaled pool options applied when the caller does not pass
 * `autoscaledPoolOptions` to the HttpCrawler constructor.
 */
const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS = {
    desiredConcurrency: 10,
    snapshotterOptions: { eventLoopSnapshotIntervalSecs: 2, maxBlockedMillis: 100 },
    systemStatusOptions: { maxEventLoopOverloadedRatio: 0.7 },
};
/**
* Provides a framework for the parallel crawling of web pages using plain HTTP requests.
* The URLs to crawl are fed either from a static list of URLs
* or from a dynamic queue of URLs enabling recursive crawling of websites.
*
* It is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
* to display the content, you might need to use {@apilink PuppeteerCrawler} or {@apilink PlaywrightCrawler} instead,
* because they load the pages using a full-featured headless Chrome browser.
*
* This crawler downloads each URL using a plain HTTP request and doesn't do any HTML parsing.
*
* The source URLs are represented using {@apilink Request} objects that are fed from
* {@apilink RequestList} or {@apilink RequestQueue} instances provided by the {@apilink HttpCrawlerOptions.requestList}
* or {@apilink HttpCrawlerOptions.requestQueue} constructor options, respectively.
*
* If both {@apilink HttpCrawlerOptions.requestList} and {@apilink HttpCrawlerOptions.requestQueue} are used,
* the instance first processes URLs from the {@apilink RequestList} and automatically enqueues all of them
* to {@apilink RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
*
* The crawler finishes when there are no more {@apilink Request} objects to crawl.
*
* We can use the `preNavigationHooks` to adjust `gotOptions`:
*
* ```javascript
* preNavigationHooks: [
* (crawlingContext, gotOptions) => {
* // ...
* },
* ]
* ```
*
* By default, this crawler only processes responses with the `text/html`, `text/xml`,
* `application/xhtml+xml`, `application/xml` and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
* and skips pages with other content types. If you want the crawler to process other content types,
* use the {@apilink HttpCrawlerOptions.additionalMimeTypes} constructor option.
* Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
* For details, see {@apilink HttpCrawlerOptions.requestHandler}.
*
* New requests are only dispatched when there is enough free CPU and memory available,
* using the functionality provided by the {@apilink AutoscaledPool} class.
* All {@apilink AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
* parameter of the constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
* {@apilink AutoscaledPool} options are available directly in the constructor.
*
* **Example usage:**
*
* ```javascript
* import { HttpCrawler, Dataset } from '@crawlee/http';
*
* const crawler = new HttpCrawler({
* requestList,
* async requestHandler({ request, response, body, contentType }) {
* // Save the data to dataset.
* await Dataset.pushData({
* url: request.url,
* html: body,
* });
* },
* });
*
* await crawler.run([
* 'http://www.example.com/page-1',
* 'http://www.example.com/page-2',
* ]);
* ```
* @category Crawlers
*/
class HttpCrawler extends basic_1.BasicCrawler {
    /**
     * All `HttpCrawlerOptions` parameters are passed via an options object.
     *
     * @param options Crawler options; validated against `HttpCrawler.optionsShape`.
     * @param config Configuration instance; defaults to the global configuration.
     * @throws {Error} If `persistCookiesPerSession` is enabled while the session pool is not used.
     */
    constructor(options = {}, config = basic_1.Configuration.getGlobalConfig()) {
        (0, ow_1.default)(options, 'HttpCrawlerOptions', ow_1.default.object.exactShape(HttpCrawler.optionsShape));
        const {
            requestHandler,
            handlePageFunction,
            requestHandlerTimeoutSecs = 60,
            navigationTimeoutSecs = 30,
            ignoreSslErrors = true,
            additionalMimeTypes = [],
            suggestResponseEncoding,
            forceResponseEncoding,
            proxyConfiguration,
            persistCookiesPerSession,
            preNavigationHooks = [],
            postNavigationHooks = [],
            additionalHttpErrorStatusCodes = [],
            ignoreHttpErrorStatusCodes = [],
            // Ignored (destructured only so that it is not forwarded to BasicCrawler)
            handleRequestFunction,
            // BasicCrawler
            autoscaledPoolOptions = HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS,
            ...basicCrawlerOptions
        } = options;
        super({
            ...basicCrawlerOptions,
            requestHandler,
            autoscaledPoolOptions,
            // We need to add some time for internal functions to finish,
            // but not too much so that we would stall the crawler.
            requestHandlerTimeoutSecs: navigationTimeoutSecs + requestHandlerTimeoutSecs + basic_1.BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
        }, config);
        Object.defineProperty(this, "config", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: config
        });
        // Declare the remaining instance fields (initially undefined) in a fixed order;
        // they are assigned their real values further below.
        // `proxyConfiguration` is a reference to the underlying {@apilink ProxyConfiguration}
        // that manages the crawler's proxies and is only available if used by the crawler.
        const declaredFields = [
            'proxyConfiguration',
            'userRequestHandlerTimeoutMillis',
            'preNavigationHooks',
            'postNavigationHooks',
            'persistCookiesPerSession',
            'navigationTimeoutMillis',
            'ignoreSslErrors',
            'suggestResponseEncoding',
            'forceResponseEncoding',
            'additionalHttpErrorStatusCodes',
            'ignoreHttpErrorStatusCodes',
            'supportedMimeTypes',
        ];
        for (const fieldName of declaredFields) {
            Object.defineProperty(this, fieldName, {
                enumerable: true,
                configurable: true,
                writable: true,
                value: void 0
            });
        }
        /**
         * @internal wraps public utility for mocking purposes
         */
        Object.defineProperty(this, "_requestAsBrowser", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: async (options, session) => {
                const response = await this.httpClient.stream((0, basic_1.processHttpRequestOptions)({
                    ...options,
                    cookieJar: options.cookieJar, // HACK - the type of ToughCookieJar in got is wrong
                    responseType: 'text',
                }), (redirectResponse, updatedRequest) => {
                    // On redirects, carry the session cookies over to the follow-up request.
                    if (this.persistCookiesPerSession) {
                        session.setCookiesFromResponse(redirectResponse);
                        const cookieString = session.getCookieString(updatedRequest.url.toString());
                        if (cookieString !== '') {
                            updatedRequest.headers.Cookie = cookieString;
                        }
                    }
                });
                return addResponsePropertiesToStream(response.stream, response);
            }
        });
        this._handlePropertyNameChange({
            newName: 'requestHandler',
            oldName: 'handlePageFunction',
            propertyKey: 'requestHandler',
            newProperty: requestHandler,
            oldProperty: handlePageFunction,
            allowUndefined: true,
        });
        if (!this.requestHandler) {
            this.requestHandler = this.router;
        }
        // Cookies should be persisted per session only if session pool is used
        if (!this.useSessionPool && persistCookiesPerSession) {
            throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.');
        }
        this.supportedMimeTypes = new Set([...HTML_AND_XML_MIME_TYPES, APPLICATION_JSON_MIME_TYPE]);
        if (additionalMimeTypes.length)
            this._extendSupportedMimeTypes(additionalMimeTypes);
        if (suggestResponseEncoding && forceResponseEncoding) {
            this.log.warning('Both forceResponseEncoding and suggestResponseEncoding options are set. Using forceResponseEncoding.');
        }
        this.userRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
        this.navigationTimeoutMillis = navigationTimeoutSecs * 1000;
        this.ignoreSslErrors = ignoreSslErrors;
        this.suggestResponseEncoding = suggestResponseEncoding;
        this.forceResponseEncoding = forceResponseEncoding;
        this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
        this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
        this.proxyConfiguration = proxyConfiguration;
        this.preNavigationHooks = preNavigationHooks;
        // The built-in hook runs first so unsupported content types are rejected
        // before any user-provided post-navigation hook runs.
        this.postNavigationHooks = [
            ({ request, response }) => this._abortDownloadOfBody(request, response),
            ...postNavigationHooks,
        ];
        if (this.useSessionPool) {
            this.persistCookiesPerSession = persistCookiesPerSession ?? true;
        }
        else {
            this.persistCookiesPerSession = false;
        }
    }
    /**
     * **EXPERIMENTAL**
     * Function for attaching CrawlerExtensions such as the Unblockers.
     * @param extension Crawler extension that overrides the crawler configuration.
     * @throws {Error} If the extension sets a key that is not an own property of this instance,
     *   or a value whose type differs from the existing (non-nullish) value.
     */
    use(extension) {
        (0, ow_1.default)(extension, ow_1.default.object.instanceOf(basic_1.CrawlerExtension));
        const className = this.constructor.name;
        const extensionOptions = extension.getCrawlerOptions();
        for (const [key, value] of Object.entries(extensionOptions)) {
            const isConfigurable = Object.hasOwn(this, key);
            const originalType = typeof this[key];
            const extensionType = typeof value; // What if we want to null something? It is really needed?
            const isSameType = originalType === extensionType || value == null; // fast track for deleting keys
            const exists = this[key] != null;
            if (!isConfigurable) {
                // Test if the property can be configured on the crawler
                throw new Error(`${extension.name} tries to set property "${key}" that is not configurable on ${className} instance.`);
            }
            if (!isSameType && exists) {
                // Assuming that extensions will only add up configuration
                throw new Error(`${extension.name} tries to set property of different type "${extensionType}". "${className}.${key}: ${originalType}".`);
            }
            this.log.warning(`${extension.name} is overriding "${className}.${key}: ${originalType}" with ${value}.`);
            this[key] = value;
        }
    }
    /**
     * Wrapper around requestHandler: resolves proxy info, performs the navigation
     * (unless `request.skipNavigation` is set), parses the response, enriches the
     * crawling context and finally runs the user's request handler with a timeout.
     * @param crawlingContext Context of the request being processed; mutated in place.
     */
    async _runRequestHandler(crawlingContext) {
        const { request, session } = crawlingContext;
        if (this.proxyConfiguration) {
            const sessionId = session ? session.id : undefined;
            crawlingContext.proxyInfo = await this.proxyConfiguration.newProxyInfo(sessionId, { request });
        }
        if (!request.skipNavigation) {
            await this._handleNavigation(crawlingContext);
            (0, timeout_1.tryCancel)();
            const parsed = await this._parseResponse(request, crawlingContext.response, crawlingContext);
            const response = parsed.response;
            const contentType = parsed.contentType;
            (0, timeout_1.tryCancel)();
            // Decode the body with the detected encoding (same as the `json` getter below),
            // so that non-UTF-8 responses in a Node.js-supported encoding are not garbled.
            const loadBody = () => cheerio.load(parsed.body.toString(contentType.encoding));
            // `??=` because descendant classes may already set optimized version
            crawlingContext.waitForSelector ?? (crawlingContext.waitForSelector = async (selector, _timeoutMs) => {
                const $ = loadBody();
                if ($(selector).get().length === 0) {
                    throw new Error(`Selector '${selector}' not found.`);
                }
            });
            crawlingContext.parseWithCheerio ?? (crawlingContext.parseWithCheerio = async (selector, timeoutMs) => {
                const $ = loadBody();
                if (selector) {
                    await crawlingContext.waitForSelector(selector, timeoutMs);
                }
                return $;
            });
            if (this.useSessionPool) {
                this._throwOnBlockedRequest(crawlingContext.session, response.statusCode);
            }
            if (this.persistCookiesPerSession) {
                crawlingContext.session.setCookiesFromResponse(response);
            }
            request.loadedUrl = response.url;
            if (!this.requestMatchesEnqueueStrategy(request)) {
                this.log.debug(
                // eslint-disable-next-line dot-notation
                `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`);
                request.noRetry = true;
                request.state = basic_1.RequestState.SKIPPED;
                return;
            }
            Object.assign(crawlingContext, parsed);
            // Lazily parsed JSON body; `null` for non-JSON content types.
            Object.defineProperty(crawlingContext, 'json', {
                get() {
                    if (contentType.type !== APPLICATION_JSON_MIME_TYPE)
                        return null;
                    const jsonString = parsed.body.toString(contentType.encoding);
                    return JSON.parse(jsonString);
                },
            });
        }
        if (this.retryOnBlocked) {
            const error = await this.isRequestBlocked(crawlingContext);
            if (error)
                throw new basic_1.SessionError(error);
        }
        request.state = basic_1.RequestState.REQUEST_HANDLER;
        try {
            await (0, timeout_1.addTimeoutToPromise)(async () => Promise.resolve(this.requestHandler(crawlingContext)), this.userRequestHandlerTimeoutMillis, `requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`);
            request.state = basic_1.RequestState.DONE;
        }
        catch (e) {
            request.state = basic_1.RequestState.ERROR;
            throw e;
        }
    }
    /**
     * Checks an HTML/XML response for any of the `RETRY_CSS_SELECTORS`.
     * @param crawlingContext Context of the request being processed.
     * @returns A message listing the matched selectors, or `false` when none matched
     *   or the response is not HTML/XML.
     */
    async isRequestBlocked(crawlingContext) {
        // `contentType` is only set after a navigation; it is missing when
        // `request.skipNavigation` is used, in which case there is nothing to inspect.
        const type = crawlingContext.contentType?.type;
        if (HTML_AND_XML_MIME_TYPES.includes(type)) {
            const $ = await crawlingContext.parseWithCheerio();
            const foundSelectors = utils_1.RETRY_CSS_SELECTORS.filter((selector) => $(selector).length > 0);
            if (foundSelectors.length > 0) {
                return `Found selectors: ${foundSelectors.join(', ')}`;
            }
        }
        return false;
    }
    /**
     * Runs the pre-navigation hooks, merges cookies, performs the HTTP request with the
     * navigation timeout and then runs the post-navigation hooks.
     * Stores the response stream in `crawlingContext.response`.
     * @param crawlingContext Context of the request being processed; mutated in place.
     */
    async _handleNavigation(crawlingContext) {
        const gotOptions = {};
        const { request, session } = crawlingContext;
        const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
        request.state = basic_1.RequestState.BEFORE_NAV;
        // Execute pre navigation hooks before applying session pool cookies,
        // as they may also set cookies in the session
        await this._executeHooks(this.preNavigationHooks, crawlingContext, gotOptions);
        (0, timeout_1.tryCancel)();
        const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);
        this._applyCookies(crawlingContext, gotOptions, preNavigationHooksCookies, postNavigationHooksCookies);
        const proxyUrl = crawlingContext.proxyInfo?.url;
        crawlingContext.response = await (0, timeout_1.addTimeoutToPromise)(async () => this._requestFunction({ request, session, proxyUrl, gotOptions }), this.navigationTimeoutMillis, `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
        (0, timeout_1.tryCancel)();
        request.state = basic_1.RequestState.AFTER_NAV;
        await this._executeHooks(this.postNavigationHooks, crawlingContext, gotOptions);
        (0, timeout_1.tryCancel)();
    }
    /**
     * Sets the cookie header to `gotOptions` based on the provided request and session headers, as well as any changes that occurred due to hooks.
     * The merged value is always written to `gotOptions.headers.Cookie`; any lowercase
     * `cookie` header is removed.
     */
    _applyCookies({ session, request }, gotOptions, preHookCookies, postHookCookies) {
        const sessionCookie = session?.getCookieString(request.url) ?? '';
        let alteredGotOptionsCookies = gotOptions.headers?.Cookie || gotOptions.headers?.cookie || '';
        if (gotOptions.headers?.Cookie && gotOptions.headers?.cookie) {
            const { Cookie: upperCaseHeader, cookie: lowerCaseHeader } = gotOptions.headers;
            this.log.warning(`Encountered mixed casing for the cookie headers in the got options for request ${request.url} (${request.id}). Their values will be merged`);
            const sourceCookies = [];
            if (Array.isArray(lowerCaseHeader)) {
                sourceCookies.push(...lowerCaseHeader);
            }
            else {
                sourceCookies.push(lowerCaseHeader);
            }
            if (Array.isArray(upperCaseHeader)) {
                sourceCookies.push(...upperCaseHeader);
            }
            else {
                sourceCookies.push(upperCaseHeader);
            }
            alteredGotOptionsCookies = (0, basic_1.mergeCookies)(request.url, sourceCookies);
        }
        // Merge order: session, request cookies before hooks, hook-provided got options,
        // request cookies after hooks.
        const sourceCookies = [sessionCookie, preHookCookies];
        if (Array.isArray(alteredGotOptionsCookies)) {
            sourceCookies.push(...alteredGotOptionsCookies);
        }
        else {
            sourceCookies.push(alteredGotOptionsCookies);
        }
        sourceCookies.push(postHookCookies);
        const mergedCookie = (0, basic_1.mergeCookies)(request.url, sourceCookies);
        gotOptions.headers ?? (gotOptions.headers = {});
        Reflect.deleteProperty(gotOptions.headers, 'Cookie');
        Reflect.deleteProperty(gotOptions.headers, 'cookie');
        if (mergedCookie !== '') {
            gotOptions.headers.Cookie = mergedCookie;
        }
    }
    /**
     * Makes the HTTP request through `_requestAsBrowser` and returns the response stream.
     * A `TimeoutError` from `got-scraping` is routed to `_handleRequestTimeout`, and
     * proxy errors are rethrown as `SessionError`; any other error is rethrown unchanged.
     */
    async _requestFunction({ request, session, proxyUrl, gotOptions, }) {
        if (!TimeoutError) {
            // Loaded lazily via dynamic import and cached in the module-level variable.
            // @ts-ignore
            ({ TimeoutError } = await import('got-scraping'));
        }
        const opts = this._getRequestOptions(request, session, proxyUrl, gotOptions);
        try {
            return await this._requestAsBrowser(opts, session);
        }
        catch (e) {
            if (e instanceof TimeoutError) {
                // `_handleRequestTimeout` throws in this class; the return only applies
                // if an override chooses not to throw.
                this._handleRequestTimeout(session);
                return undefined;
            }
            if (this.isProxyError(e)) {
                throw new basic_1.SessionError(this._getMessageFromError(e));
            }
            else {
                throw e;
            }
        }
    }
    /**
     * Encodes and parses response according to the provided content type
     * @param request Request being processed.
     * @param responseStream Response stream produced by the navigation.
     * @param crawlingContext Context passed through to `_parseHTML`.
     * @returns Object with `body`, `response` and `contentType` (plus `isXml` for HTML/XML
     *   and a no-op `enqueueLinks` for other content types).
     * @throws {Error} For status codes >= 500 that are not ignored, and for any status
     *   code listed in `additionalHttpErrorStatusCodes`.
     */
    async _parseResponse(request, responseStream, crawlingContext) {
        const { statusCode } = responseStream;
        const { type, charset } = parseContentTypeFromResponse(responseStream);
        const { response, encoding } = this._encodeResponse(request, responseStream, charset);
        const contentType = { type, encoding };
        if (statusCode >= 400 && statusCode <= 599) {
            this.stats.registerStatusCode(statusCode);
        }
        const excludeError = this.ignoreHttpErrorStatusCodes.has(statusCode);
        const includeError = this.additionalHttpErrorStatusCodes.has(statusCode);
        if ((statusCode >= 500 && !excludeError) || includeError) {
            const body = await (0, utilities_1.readStreamToString)(response, encoding);
            // Errors are often sent as JSON, so attempt to parse them,
            // despite Accept header being set to text/html.
            if (type === APPLICATION_JSON_MIME_TYPE) {
                let errorResponse;
                let isValidJson = true;
                try {
                    errorResponse = JSON.parse(body);
                }
                catch {
                    // A malformed JSON body must not hide the HTTP status code behind a
                    // SyntaxError; fall through to the plain-text handling below.
                    isValidJson = false;
                }
                if (isValidJson) {
                    // Optional chaining: the body may legitimately be the JSON literal `null`.
                    let message = errorResponse?.message;
                    if (!message)
                        message = node_util_1.default.inspect(errorResponse, { depth: 1, maxArrayLength: 10 });
                    throw new Error(`${statusCode} - ${message}`);
                }
            }
            if (includeError) {
                throw new Error(`${statusCode} - Error status code was set by user.`);
            }
            // It's not a JSON, so it's probably some text. Get the first 100 chars of it.
            throw new Error(`${statusCode} - Internal Server Error: ${body.slice(0, 100)}`);
        }
        if (HTML_AND_XML_MIME_TYPES.includes(type)) {
            const isXml = type.includes('xml');
            const parsed = await this._parseHTML(response, isXml, crawlingContext);
            return { ...parsed, isXml, response, contentType };
        }
        const body = await (0, utilities_1.concatStreamToBuffer)(response);
        return {
            body,
            response,
            contentType,
            enqueueLinks: async () => Promise.resolve({ processedRequests: [], unprocessedRequests: [] }),
        };
    }
    /**
     * Reads the whole response into a buffer without parsing it.
     * The `_isXml` and `_crawlingContext` parameters are unused here.
     * @returns Object with the response `body` as a buffer.
     */
    async _parseHTML(response, _isXml, _crawlingContext) {
        return {
            body: await (0, utilities_1.concatStreamToBuffer)(response),
        };
    }
    /**
     * Combines the provided `requestOptions` with mandatory (non-overridable) values.
     * `headers`, `https.rejectUnauthorized` and `isStream` are applied after `gotOptions`
     * and therefore win over values supplied by hooks.
     */
    _getRequestOptions(request, session, proxyUrl, gotOptions) {
        const requestOptions = {
            url: request.url,
            method: request.method,
            proxyUrl,
            timeout: { request: this.navigationTimeoutMillis },
            sessionToken: session,
            ...gotOptions,
            headers: { ...request.headers, ...gotOptions?.headers },
            https: {
                ...gotOptions?.https,
                rejectUnauthorized: !this.ignoreSslErrors,
            },
            isStream: true,
        };
        // Delete any possible lowercased header for cookie as they are merged in _applyCookies under the uppercase Cookie header
        Reflect.deleteProperty(requestOptions.headers, 'cookie');
        // TODO this is incorrect, the check for man in the middle needs to be done
        //   on individual proxy level, not on the `proxyConfiguration` level,
        //   because users can use normal + MITM proxies in a single configuration.
        // Disable SSL verification for MITM proxies
        if (this.proxyConfiguration && this.proxyConfiguration.isManInTheMiddle) {
            requestOptions.https = {
                ...requestOptions.https,
                rejectUnauthorized: false,
            };
        }
        if (/PATCH|POST|PUT/.test(request.method))
            requestOptions.body = request.payload ?? '';
        return requestOptions;
    }
    /**
     * Picks the response encoding and, when Node.js cannot decode it natively,
     * pipes the response through iconv-lite to re-encode it as UTF-8.
     * @param request Request being processed (used for the error message only).
     * @param response Response stream.
     * @param encoding Charset taken from the response content type, if any.
     * @returns The (possibly re-encoded) response stream and the encoding to decode it with.
     * @throws {Error} If the encoding is supported by neither Node.js nor iconv-lite.
     */
    _encodeResponse(request, response, encoding) {
        if (this.forceResponseEncoding) {
            encoding = this.forceResponseEncoding;
        }
        else if (!encoding && this.suggestResponseEncoding) {
            encoding = this.suggestResponseEncoding;
        }
        // Fall back to utf-8 if we still don't have encoding.
        const utf8 = 'utf8';
        if (!encoding)
            return { response, encoding: utf8 };
        // This means that the encoding is one of Node.js supported
        // encodings and we don't need to re-encode it.
        if (Buffer.isEncoding(encoding))
            return { response, encoding };
        // Try to re-encode a variety of unsupported encodings to utf-8
        if (iconv_lite_1.default.encodingExists(encoding)) {
            const encodeStream = iconv_lite_1.default.encodeStream(utf8);
            // Forward errors down the pipeline so the consumer of the final stream sees them.
            const decodeStream = iconv_lite_1.default.decodeStream(encoding).on('error', (err) => encodeStream.emit('error', err));
            response.on('error', (err) => decodeStream.emit('error', err));
            const encodedResponse = response.pipe(decodeStream).pipe(encodeStream);
            encodedResponse.statusCode = response.statusCode;
            encodedResponse.headers = response.headers;
            encodedResponse.url = response.url;
            return {
                response: encodedResponse,
                encoding: utf8,
            };
        }
        throw new Error(`Resource ${request.url} served with unsupported charset/encoding: ${encoding}`);
    }
    /**
     * Checks and extends supported mime types
     * @param additionalMimeTypes Mime types to add; the literal `*&#47;*` is stored as-is.
     * @throws {Error} If a mime type cannot be parsed; the parser error is attached as `cause`.
     */
    _extendSupportedMimeTypes(additionalMimeTypes) {
        for (const mimeType of additionalMimeTypes) {
            if (mimeType === '*/*') {
                this.supportedMimeTypes.add(mimeType);
                continue;
            }
            try {
                const parsedType = content_type_1.default.parse(mimeType);
                this.supportedMimeTypes.add(parsedType.type);
            }
            catch (err) {
                throw new Error(`Can not parse mime type ${mimeType} from "options.additionalMimeTypes".`, { cause: err });
            }
        }
    }
    /**
     * Handles timeout request: marks the session as bad and throws.
     * @param session Session used for the request, if any.
     * @throws {Error} Always.
     */
    _handleRequestTimeout(session) {
        session?.markBad();
        // The HTTP request timeout is configured from `navigationTimeoutMillis`
        // (see `_getRequestOptions`), so that is the limit to report here.
        throw new Error(`request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`);
    }
    /**
     * Built-in post-navigation hook that rejects responses with an unsupported content type.
     * Responses with status >= 500 or a session-pool blocked status code are let through,
     * because their content type may differ on retry.
     * @throws {Error} If the content type is not supported; also sets `request.noRetry`.
     */
    _abortDownloadOfBody(request, response) {
        const { statusCode } = response;
        const { type } = parseContentTypeFromResponse(response);
        // eslint-disable-next-line dot-notation -- accessing private property
        const blockedStatusCodes = this.sessionPool ? this.sessionPool['blockedStatusCodes'] : [];
        // if we retry the request, can the Content-Type change?
        const isTransientContentType = statusCode >= 500 || blockedStatusCodes.includes(statusCode);
        if (!this.supportedMimeTypes.has(type) && !this.supportedMimeTypes.has('*/*') && !isTransientContentType) {
            request.noRetry = true;
            throw new Error(`Resource ${request.url} served Content-Type ${type}, ` +
                `but only ${Array.from(this.supportedMimeTypes).join(', ')} are allowed. Skipping resource.`);
        }
    }
}
exports.HttpCrawler = HttpCrawler;
{
    // Validation shape for the HttpCrawler constructor options: everything
    // BasicCrawler accepts plus the HTTP-specific options listed here.
    const httpCrawlerOptionsShape = {
        ...basic_1.BasicCrawler.optionsShape,
        handlePageFunction: ow_1.default.optional.function,
        navigationTimeoutSecs: ow_1.default.optional.number,
        ignoreSslErrors: ow_1.default.optional.boolean,
        additionalMimeTypes: ow_1.default.optional.array.ofType(ow_1.default.string),
        suggestResponseEncoding: ow_1.default.optional.string,
        forceResponseEncoding: ow_1.default.optional.string,
        proxyConfiguration: ow_1.default.optional.object.validate(basic_1.validators.proxyConfiguration),
        persistCookiesPerSession: ow_1.default.optional.boolean,
        additionalHttpErrorStatusCodes: ow_1.default.optional.array.ofType(ow_1.default.number),
        ignoreHttpErrorStatusCodes: ow_1.default.optional.array.ofType(ow_1.default.number),
        preNavigationHooks: ow_1.default.optional.array,
        postNavigationHooks: ow_1.default.optional.array,
    };
    // Defined as a writable, enumerable and configurable static property.
    Object.defineProperty(HttpCrawler, "optionsShape", {
        value: httpCrawlerOptionsShape,
        writable: true,
        configurable: true,
        enumerable: true,
    });
}
/**
 * Copies the response metadata onto the stream so callers can work with a
 * single object. Properties already present on the stream are left untouched.
 * Once the stream emits `end`, `complete` is refreshed from the response, and
 * `rawTrailers` / `trailers` are refreshed when the stream holds a truthy value for them.
 * @param stream Stream to decorate; mutated and returned.
 * @param response Response object to take the properties from.
 * @returns The same `stream` instance.
 * @internal
 */
function addResponsePropertiesToStream(stream, response) {
    const syncAfterEnd = () => {
        if (stream.rawTrailers) {
            stream.rawTrailers = response.rawTrailers; // TODO BC with got - remove in 4.0
        }
        if (stream.trailers) {
            stream.trailers = response.trailers;
        }
        stream.complete = response.complete;
    };
    stream.on('end', syncAfterEnd);
    const mirroredKeys = [
        'statusCode',
        'statusMessage',
        'headers',
        'complete',
        'httpVersion',
        'rawHeaders',
        'rawTrailers',
        'trailers',
        'url',
        'request',
    ];
    for (const key of mirroredKeys) {
        if (key in stream) {
            continue;
        }
        stream[key] = response[key];
    }
    return stream;
}
/**
 * Determines the content type and charset of a response.
 * The `Content-Type` header is used when it is present and parseable; otherwise the
 * type is derived from the file extension in the URL path, and finally falls back to
 * `application/octet-stream; charset=utf-8`.
 * @param response HTTP response object with `url` and `headers`
 * @returns Object with the parsed `type` and `charset`
 */
function parseContentTypeFromResponse(response) {
    const expectedShape = ow_1.default.object.partialShape({
        url: ow_1.default.string.url,
        headers: new ow_1.ObjectPredicate(),
    });
    (0, ow_1.default)(response, expectedShape);
    const { url, headers } = response;
    let contentTypeInfo;
    const headerValue = headers['content-type'];
    if (headerValue) {
        try {
            contentTypeInfo = content_type_1.default.parse(headerValue);
        }
        catch {
            // Can not parse content type from Content-Type header. Try to parse it from file extension.
        }
    }
    // Parse content type from file extension as fallback
    if (!contentTypeInfo) {
        const { pathname } = new URL(url);
        const extension = (0, node_path_1.extname)(pathname);
        // Fallback content type, specified in https://tools.ietf.org/html/rfc7231#section-3.1.1.5
        const fallbackContentType = mime_types_1.default.contentType(extension) || 'application/octet-stream; charset=utf-8';
        contentTypeInfo = content_type_1.default.parse(fallbackContentType);
    }
    const { type, parameters } = contentTypeInfo;
    return { type, charset: parameters.charset };
}
/**
 * Builds a new {@apilink Router} by delegating to `Router.create(routes)`.
 * The returned router can be passed as the `requestHandler` of an {@apilink HttpCrawler}.
 *
 * > Serves as a shortcut for using `Router.create<HttpCrawlingContext>()`.
 *
 * ```ts
 * import { HttpCrawler, createHttpRouter } from 'crawlee';
 *
 * const router = createHttpRouter();
 * router.addHandler('label-a', async (ctx) => {
 *     ctx.log.info('...');
 * });
 * router.addDefaultHandler(async (ctx) => {
 *     ctx.log.info('...');
 * });
 *
 * const crawler = new HttpCrawler({
 *     requestHandler: router,
 * });
 * await crawler.run();
 * ```
 *
 * @param routes Optional routes forwarded unchanged to `Router.create`.
 * @returns The router instance created by `Router.create`.
 */
function createHttpRouter(routes) {
    const router = basic_1.Router.create(routes);
    return router;
}
//# sourceMappingURL=http-crawler.js.map