@xcrap/core
Version:
Xcrap Core is the core package of the Xcrap framework for web scraping, offering tools such as HttpClient, BaseClient, Randomizer, Rotator, and support for proxies and pagination.
109 lines (108 loc) • 5.59 kB
JavaScript
;
// CommonJS/ES-module interop helper: reuses an `__importDefault` already present
// on `this` when there is one; otherwise returns ES-module namespace objects
// untouched and wraps any other value as `{ default: value }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.HttpClient = void 0;
const https_proxy_agent_1 = require("https-proxy-agent");
const node_url_1 = require("node:url");
const node_https_1 = __importDefault(require("node:https"));
const node_http_1 = __importDefault(require("node:http"));
const base_client_1 = require("./base-client");
const http_response_1 = require("./http-response");
const errors_1 = require("./errors");
const constants_1 = require("./constants");
const delay_1 = require("./utils/delay");
/**
 * HTTP client backed by Node's built-in `node:http` and `node:https` modules.
 *
 * Several members used here are not defined in this class and are expected to
 * be provided by `BaseClient`: `currentProxyUrl`, `currentProxy`,
 * `currentUserAgent`, `isSuccess`, `executeRequest`, `shouldThrottle` and
 * `handleConcurrency`.
 */
class HttpClient extends base_client_1.BaseClient {
    /**
     * @param {object} [options] - Passed unchanged to the `BaseClient` constructor.
     */
    constructor(options = {}) {
        super(options);
    }
    /**
     * Sends one HTTP(S) request, following redirects and retrying on failure.
     *
     * Failures (network errors, unsuccessful status codes, too many redirects)
     * are recorded in `failedAttempts`. Once the retry budget is exhausted the
     * returned promise still resolves, with an `HttpResponse` describing the
     * failure (status 500 / "Request Failed" unless the error carries a
     * `response` object with its own values).
     *
     * @param {object} params
     * @param {number} [params.maxRetries=0] - Highest retry index allowed after a failed attempt.
     * @param {number} [params.retries=0] - Retry index the first attempt starts from.
     * @param {number} [params.retryDelay] - Value handed to `delay()` before each retry; skipped unless > 0.
     * @param {string} [params.method="GET"] - HTTP method sent with the request.
     * @param {number} [params.redirectCount=0] - Redirects already followed; more than 5 is an error.
     * @param {boolean} [params.followRedirects=true] - Follow 301/302/303/307/308 responses that carry a `Location` header.
     * @param {string} params.url - Target URL; prefixed with `currentProxyUrl` when that is set.
     * @param {object} [params.headers] - Request headers; a `user-agent` entry wins over the client default.
     * @returns {Promise<HttpResponse>} The successful response, or a synthetic failure response.
     */
    fetch({ maxRetries = 0, retries = 0, retryDelay, method = "GET", redirectCount = 0, followRedirects = true, ...options }) {
        const maxRedirects = 5;
        const redirectStatusCodes = [301, 302, 303, 307, 308];
        const failedAttempts = [];
        // One network attempt. Every failure path goes through `reject` so that
        // the retry loop below can observe it.
        const sendRequest = (currentRetry) => new Promise((resolve, reject) => {
            const url = this.currentProxyUrl ? `${this.currentProxyUrl}${options.url}` : options.url;
            const urlObject = new node_url_1.URL(url);
            const proxyAgent = this.currentProxy ? new https_proxy_agent_1.HttpsProxyAgent(this.currentProxy) : undefined;
            const lib = urlObject.protocol === "http:" ? node_http_1.default : node_https_1.default;
            // Precedence: explicit header, then the client's current user agent,
            // then the package default.
            let userAgent = options.headers != null ? options.headers["user-agent"] : undefined;
            if (userAgent == null) {
                userAgent = this.currentUserAgent;
            }
            if (userAgent == null) {
                userAgent = constants_1.defaultUserAgent;
            }
            const request = lib.request(urlObject, {
                ...options,
                // `method` is destructured out of `options`, so it has to be
                // passed explicitly or every request silently becomes a GET.
                method,
                headers: {
                    ...options.headers,
                    "user-agent": userAgent
                },
                agent: proxyAgent
            }, (response) => {
                const statusCode = response.statusCode;
                const location = response.headers.location;
                // Redirects are handled before the status check; otherwise a
                // 3xx that `isSuccess` rejects could never be followed.
                if (followRedirects && redirectStatusCodes.includes(statusCode) && location) {
                    // Drain the unused body so the socket can be released.
                    response.resume();
                    if (redirectCount >= maxRedirects) {
                        return reject(new Error("Too many redirects"));
                    }
                    let newUrl;
                    try {
                        // Resolve against the caller's URL rather than the
                        // proxy-prefixed one: the recursive call adds the
                        // prefix again.
                        newUrl = new node_url_1.URL(location, options.url).href;
                    }
                    catch (error) {
                        return reject(error);
                    }
                    // Same method rewriting as the Fetch standard: 303 turns
                    // everything except HEAD into GET, 301/302 turn POST into GET.
                    const upperMethod = String(method).toUpperCase();
                    const switchToGet = (statusCode === 303 && upperMethod !== "HEAD")
                        || ((statusCode === 301 || statusCode === 302) && upperMethod === "POST");
                    return resolve(this.fetch({
                        ...options,
                        url: newUrl,
                        method: switchToGet ? "GET" : method,
                        maxRetries,
                        retryDelay,
                        followRedirects,
                        redirectCount: redirectCount + 1
                    }));
                }
                if (statusCode && !this.isSuccess(statusCode)) {
                    response.resume();
                    // Reject instead of throwing: a throw inside this callback
                    // would escape the promise as an uncaught exception.
                    return reject(new errors_1.InvalidStatusCodeError(statusCode));
                }
                // Decode as a stream so multi-byte characters split across
                // chunks are not corrupted.
                response.setEncoding("utf8");
                let data = "";
                response.on("data", (chunk) => {
                    data += chunk;
                });
                response.on("error", reject);
                response.on("end", () => resolve(new http_response_1.HttpResponse({
                    body: data,
                    headers: response.headers,
                    status: statusCode || 200,
                    statusText: response.statusMessage || "ok",
                    attempts: currentRetry + 1,
                    failedAttempts: failedAttempts,
                })));
            });
            request.on("error", reject);
            request.end();
        });
        const attemptRequest = async (currentRetry) => {
            try {
                // `await` is required: returning the bare promise would let a
                // rejection skip the catch block and disable retries.
                return await sendRequest(currentRetry);
            }
            catch (error) {
                const errorMessage = error instanceof Error ? error.message : "Unknown error";
                failedAttempts.push({ error: errorMessage, timestamp: new Date() });
                if (currentRetry < maxRetries) {
                    if (retryDelay !== undefined && retryDelay > 0) {
                        await (0, delay_1.delay)(retryDelay);
                    }
                    return await attemptRequest(currentRetry + 1);
                }
                // Errors may carry a `response` object; fall back to generic
                // failure values when they do not.
                const failedResponse = (error && error.response) || {};
                return new http_response_1.HttpResponse({
                    status: failedResponse.status || 500,
                    statusText: failedResponse.statusText || "Request Failed",
                    body: failedResponse.data || errorMessage,
                    headers: failedResponse.headers || {},
                    attempts: currentRetry + 1,
                    failedAttempts,
                });
            }
        };
        return attemptRequest(retries);
    }
    /**
     * Runs a batch of requests through `executeRequest`, pausing whenever
     * `shouldThrottle` says so.
     *
     * NOTE(review): `executeRequest`, `shouldThrottle` and `handleConcurrency`
     * are not defined in this class; presumably `executeRequest` writes each
     * response into `results` and the other two enforce `concurrency` —
     * confirm against `BaseClient`.
     *
     * @param {object} params
     * @param {Array} params.requests - Request descriptors, handed to `executeRequest` one by one.
     * @param {number} [params.concurrency] - Passed to `shouldThrottle` together with the pending promises.
     * @param {number} [params.requestDelay] - Passed through to `executeRequest`.
     * @returns {Promise<Array>} The `results` array once every request promise has settled successfully.
     */
    async fetchMany({ requests, concurrency, requestDelay }) {
        const results = [];
        const executing = [];
        for (let index = 0; index < requests.length; index++) {
            const pending = this.executeRequest({
                request: requests[index],
                index,
                requestDelay,
                results
            }).then(() => undefined);
            executing.push(pending);
            if (this.shouldThrottle(executing, concurrency)) {
                await this.handleConcurrency(executing);
            }
        }
        await Promise.all(executing);
        return results;
    }
}
exports.HttpClient = HttpClient;