UNPKG

crawler

Version:

Crawler is a ready-to-use web spider with support for proxies, asynchronous operation, rate limiting, configurable request pools, jQuery, and HTTP/2.

139 lines 4.42 kB
import { HttpProxyAgent, HttpsProxyAgent } from "hpagent";
import http2Wrapper from "http2-wrapper";
import { cleanObject, getType, isValidUrl } from "./lib/utils.js";

// Options that configure the Crawler instance as a whole; they are never
// meaningful on an individual request.
export const globalOnlyOptions = [
    "maxConnections",
    "priorityLevels",
    "rateLimit",
    "skipDuplicates",
    "homogeneous",
    "userAgents",
    "silence",
];

// Options consumed by Crawler itself. All of these (plus the global-only
// options) must be stripped before the remaining options are handed to `got`.
export const crawlerOnlyOptions = [
    "rateLimiterId",
    "forceUTF8",
    "jQuery",
    "retryInterval",
    "priority",
    "proxy",
    "retries",
    "preRequest",
    "callback",
    "release",
    "isJson",
    "referer",
    "rejectUnauthorized",
    "userParams",
].concat(globalOnlyOptions);

// Legacy option names kept for backward compatibility. `alignOptions` maps
// each one onto its modern equivalent before stripping it.
export const deprecatedOptions = [
    "uri",
    "qs",
    "strictSSL",
    "incomingEncoding",
    "gzip",
    "jar",
    "jsonReviver",
    "jsonReplacer",
    "skipEventRequest",
];

/**
 * Extract the charset from an HTTP response headers object.
 *
 * @param {Object} headers - Response headers (keys assumed lowercase,
 *     as normalized by Node.js / got).
 * @returns {string|null} The lowercased charset name (e.g. "utf-8"),
 *     or null when no `content-type` header or no charset parameter is present.
 */
export const getCharset = (headers) => {
    let charset = null;
    const contentType = headers["content-type"];
    if (contentType) {
        const match = contentType.match(/charset=['"]?([\w.-]+)/i);
        if (match) {
            charset = match[1].trim().toLowerCase();
        }
    }
    return charset;
};

/**
 * Normalize user-supplied request options into a plain options object.
 *
 * Accepts either a string (a URL, or a JSON-encoded options object) or a
 * plain object. Anything else — including class instances, whose prototype
 * is neither `Object.prototype` nor null — is rejected.
 *
 * @param {string|Object} options - Raw options from the caller.
 * @returns {Object} A plain options object (`{ url }` for a bare URL string).
 * @throws {TypeError} When the input is neither a valid URL, parseable JSON,
 *     nor a plain object.
 */
export const getValidOptions = (options) => {
    const type = getType(options);
    if (type === "string") {
        try {
            if (isValidUrl(options)) return { url: options };
            options = JSON.parse(options);
            return options;
        } catch (_err) {
            throw new TypeError(`Invalid options: ${JSON.stringify(options)}`);
        }
    } else if (type === "object") {
        const prototype = Object.getPrototypeOf(options);
        if (prototype === Object.prototype || prototype === null) return options;
    }
    throw new TypeError(`Invalid options: ${JSON.stringify(options)}`);
};

/**
 * Translate Crawler options into an options object suitable for `got`.
 *
 * Maps deprecated names onto their modern equivalents, configures TLS and
 * proxy agents (including HTTP/2-over-proxy via http2-wrapper), strips every
 * Crawler-only and deprecated key, and derives a default `referer` header
 * from the request URL when none was provided.
 *
 * NOTE: mutates `options.encoding` (deprecated `incomingEncoding` fallback)
 * because callers read `options.encoding` after this function returns.
 *
 * @param {Object} options - Normalized crawler request options.
 * @returns {Object} Options ready to be passed to `got`.
 */
export const alignOptions = (options) => {
    const gotOptions = {
        ...options,
        url: options.url ?? options.uri,
        searchParams: options.searchParams ?? options.qs,
        decompress: options.decompress ?? options.gzip,
        parseJson: options.parseJson ?? options.jsonReviver,
        stringifyJson: options.stringifyJson ?? options.jsonReplacer,
        cookieJar: options.cookieJar ?? options.jar,
        timeout: { request: options.timeout },
    };

    // TLS verification: modern `rejectUnauthorized` wins over deprecated `strictSSL`.
    const sslConfig = options.rejectUnauthorized ?? options.strictSSL;
    if (sslConfig !== undefined) {
        if (gotOptions.https === undefined) {
            gotOptions.https = { rejectUnauthorized: sslConfig };
        } else {
            gotOptions.https.rejectUnauthorized = sslConfig;
        }
    }

    const defaultagent = options["proxy"]
        ? {
              https: new HttpsProxyAgent({ proxy: options["proxy"] }),
              http: new HttpProxyAgent({ proxy: options["proxy"] }),
          }
        : undefined;

    // http2 proxy
    if (options.http2 === true && options.proxy) {
        const { proxies: Http2Proxies } = http2Wrapper;
        const protocol = options.proxy.startsWith("https") ? "https" : "http";
        const http2Agent =
            protocol === "https"
                ? new Http2Proxies.Http2OverHttps({
                      proxyOptions: { url: options.proxy },
                  })
                : new Http2Proxies.Http2OverHttp({
                      proxyOptions: { url: options.proxy },
                  });
        gotOptions.agent = { http2: http2Agent };
    } else {
        // Respect a caller-provided agent; otherwise fall back to the proxy
        // agents built above (undefined when no proxy is configured).
        gotOptions.agent = gotOptions.agent ?? (options.proxy ? defaultagent : undefined);
    }

    /**
     * @deprecated The support of incomingEncoding will be removed in the next major version.
     */
    if (options.encoding === undefined) options.encoding = options.incomingEncoding;
    gotOptions.responseType = "buffer";

    // Strip every key `got` must not see (Crawler-only plus deprecated names).
    const invalidOptions = crawlerOnlyOptions.concat(deprecatedOptions);
    invalidOptions.forEach(key => {
        if (key in gotOptions) {
            delete gotOptions[key];
        }
    });

    // `cleanObject` lives in ./lib/utils.js (not visible here); headers are
    // saved across the call and restored afterwards so it cannot drop them.
    const headers = gotOptions.headers;
    cleanObject(gotOptions);
    // Fix: default to an empty object — previously, when the caller supplied
    // no headers at all, `gotOptions.headers.referer` below threw a TypeError.
    gotOptions.headers = headers ?? {};
    if (!gotOptions.headers.referer) {
        if (options.referer) {
            gotOptions.headers.referer = options.referer;
        } else {
            // Derive "scheme://host" from the request URL as the default referer.
            const domain = gotOptions.url.match(/^(\w+):\/\/([^/]+)/);
            if (domain) gotOptions.headers.referer = domain[0];
        }
    }
    // Crawler implements its own retry logic; disable got's built-in retries.
    gotOptions.retry = { limit: 0 };
    return gotOptions;
};
//# sourceMappingURL=options.js.map