// crawler — a ready-to-use web spider supporting proxies, asynchrony,
// rate limiting, configurable request pools, jQuery, and HTTP/2.
import { HttpProxyAgent, HttpsProxyAgent } from "hpagent";
import http2Wrapper from "http2-wrapper";
import { cleanObject, getType, isValidUrl } from "./lib/utils.js";
/**
 * Option keys that configure the Crawler instance as a whole and are
 * never forwarded to individual requests.
 */
export const globalOnlyOptions = [
    "maxConnections",
    "priorityLevels",
    "rateLimit",
    "skipDuplicates",
    "homogeneous",
    "userAgents",
    "silence",
];
/**
 * Option keys understood only by the crawler layer (superset of the
 * global-only keys); these are stripped before options are handed to got.
 */
export const crawlerOnlyOptions = [
    "rateLimiterId",
    "forceUTF8",
    "jQuery",
    "retryInterval",
    "priority",
    "proxy",
    "retries",
    "preRequest",
    "callback",
    "release",
    "isJson",
    "referer",
    "rejectUnauthorized",
    "userParams",
].concat(globalOnlyOptions);
/**
 * Legacy `request`-style option keys kept for backward compatibility;
 * alignOptions maps them onto their got equivalents, then strips them.
 */
export const deprecatedOptions = [
    "uri",
    "qs",
    "strictSSL",
    "incomingEncoding",
    "gzip",
    "jar",
    "jsonReviver",
    "jsonReplacer",
    "skipEventRequest",
];
/**
 * Extract the charset token from a response's Content-Type header.
 *
 * @param {Object} headers - response headers keyed by lowercase name
 * @returns {string|null} the charset, lowercased (e.g. "utf-8"), or null
 *     when the header is absent or carries no charset parameter
 */
export const getCharset = (headers) => {
    // Matches `charset=utf-8`, with optional single/double quote before
    // the value; the capture stops at the first non [\w.-] character.
    const found = headers["content-type"]?.match(/charset=['"]?([\w.-]+)/i);
    return found ? found[1].trim().toLowerCase() : null;
};
/**
 * Normalize and validate user-supplied options.
 *
 * A string is treated as a URL (wrapped as `{ url }`) or, failing that,
 * parsed as JSON. Any result must be a *plain* object (prototype of
 * `Object.prototype` or `null`); everything else is rejected.
 *
 * Fix: the original returned the `JSON.parse` result unconditionally, so
 * JSON primitives, arrays, and `null` (e.g. `"123"`, `"[1,2]"`) slipped
 * through even though the same values were rejected when passed directly.
 * The parsed value is now re-validated by the plain-object check.
 *
 * @param {string|Object} options - URL string, JSON string, or options object
 * @returns {Object} a plain options object
 * @throws {TypeError} when the input cannot be normalized to a plain object
 */
export const getValidOptions = (options) => {
    const type = getType(options);
    if (type === "string") {
        try {
            // A bare URL string becomes { url }; anything else must parse
            // as JSON and then pass the plain-object check below.
            if (isValidUrl(options))
                return { url: options };
            options = JSON.parse(options);
        }
        catch (_err) {
            throw new TypeError(`Invalid options: ${JSON.stringify(options)}`);
        }
    }
    if (getType(options) === "object") {
        const prototype = Object.getPrototypeOf(options);
        // Only accept plain objects; class instances keep their prototype
        // chain and are rejected here, as are arrays and primitives.
        if (prototype === Object.prototype || prototype === null)
            return options;
    }
    throw new TypeError(`Invalid options: ${JSON.stringify(options)}`);
};
/**
 * Convert crawler-style options into a got-compatible options object.
 *
 * Maps deprecated request-style keys (uri, qs, gzip, jar, jsonReviver,
 * jsonReplacer, strictSSL, incomingEncoding) onto their got equivalents,
 * wires up proxy agents (HTTP/1.x via hpagent, or HTTP/2 via
 * http2-wrapper), strips crawler-only and deprecated keys, and defaults
 * the referer header.
 *
 * NOTE(review): also mutates the caller's `options.encoding` as a side
 * effect (see below) — presumably read by the caller after this returns;
 * verify against call sites.
 *
 * @param {Object} options - merged crawler request options
 * @returns {Object} options object suitable for passing to got
 */
export const alignOptions = (options) => {
    // Shallow copy; `??` keeps the modern key when both it and its
    // deprecated alias are present.
    const gotOptions = {
        ...options,
        url: options.url ?? options.uri,
        searchParams: options.searchParams ?? options.qs,
        decompress: options.decompress ?? options.gzip,
        parseJson: options.parseJson ?? options.jsonReviver,
        stringifyJson: options.stringifyJson ?? options.jsonReplacer,
        cookieJar: options.cookieJar ?? options.jar,
        timeout: { request: options.timeout },
    };
    // rejectUnauthorized (or deprecated strictSSL) maps to got's
    // `https.rejectUnauthorized`; merge into an existing `https` object
    // rather than clobbering it.
    const sslConfig = options.rejectUnauthorized ?? options.strictSSL;
    if (sslConfig !== undefined) {
        if (gotOptions.https === undefined) {
            gotOptions.https = { rejectUnauthorized: sslConfig };
        }
        else {
            gotOptions.https.rejectUnauthorized = sslConfig;
        }
    }
    // HTTP/1.x proxy agents; note these are constructed eagerly even when
    // the HTTP/2 branch below ends up being taken.
    const defaultagent = options["proxy"] ? {
        https: new HttpsProxyAgent({ proxy: options["proxy"] }),
        http: new HttpProxyAgent({ proxy: options["proxy"] }),
    } : undefined;
    // http2 proxy
    if (options.http2 === true && options.proxy) {
        const { proxies: Http2Proxies } = http2Wrapper;
        // Tunnel type is chosen from the proxy URL's scheme.
        const protocol = options.proxy.startsWith("https") ? "https" : "http";
        const http2Agent = protocol === "https"
            ? new Http2Proxies.Http2OverHttps({
                proxyOptions: { url: options.proxy },
            })
            : new Http2Proxies.Http2OverHttp({
                proxyOptions: { url: options.proxy },
            });
        gotOptions.agent = { http2: http2Agent };
    }
    else {
        // Respect a caller-supplied agent; otherwise fall back to the
        // proxy agents when a proxy is configured.
        gotOptions.agent = gotOptions.agent ?? (options.proxy ? defaultagent : undefined);
    }
    /**
     * @deprecated The support of incomingEncoding will be removed in the next major version.
     */
    // NOTE(review): this mutates the caller's `options`, not `gotOptions` —
    // the spread above has already happened, so the value never reaches
    // got. Presumably the caller reads options.encoding afterwards; confirm.
    if (options.encoding === undefined)
        options.encoding = options.incomingEncoding;
    // Responses are always fetched as Buffers; any text decoding happens
    // upstream in the crawler.
    gotOptions.responseType = "buffer";
    // Strip crawler-internal and deprecated keys so got does not reject
    // or misinterpret them.
    const invalidOptions = crawlerOnlyOptions.concat(deprecatedOptions);
    invalidOptions.forEach(key => {
        if (key in gotOptions) {
            delete gotOptions[key];
        }
    });
    // Preserve headers verbatim across cleanObject (which presumably
    // drops empty/undefined entries — see ./lib/utils.js).
    const headers = gotOptions.headers;
    cleanObject(gotOptions);
    gotOptions.headers = headers;
    // Default the referer: explicit option first, otherwise the request
    // URL's origin (scheme://host).
    // NOTE(review): throws if `headers` is undefined at this point —
    // callers appear to always provide a headers object; verify.
    if (!gotOptions.headers.referer) {
        if (options.referer) {
            gotOptions.headers.referer = options.referer;
        }
        else {
            const domain = gotOptions.url.match(/^(\w+):\/\/([^/]+)/);
            if (domain)
                gotOptions.headers.referer = domain[0];
        }
    }
    // The crawler performs its own retry logic, so got's is disabled.
    gotOptions.retry = { limit: 0 };
    return gotOptions;
};
//# sourceMappingURL=options.js.map