website-scrap-engine
Version:
Configurable website scraper in TypeScript
219 lines • 7.7 kB
JavaScript
import got, { Options } from 'got';
import { createResource } from './resource.js';
// noinspection ES6PreferShortImport
import { beforeRetryHook } from './life-cycle/download-resource.js';
import { error } from './logger/logger.js';
// noinspection ES6PreferShortImport
import { adjust } from './downloader/adjust-concurrency.js';
import { configureLogger } from './logger/config-logger.js';
import { weakAssign } from './util.js';
/** Threshold in milliseconds above which the base retry delay is clamped. */
const MAX_RETRY_DELAY = 5 * 1000;
/**
 * Error codes that count as retryable whenever the retry options
 * do not bring their own list of error codes.
 */
const retryErrorCodes = new Set([
  'ETIMEDOUT', // one of the timeout limits was reached
  'ECONNRESET', // connection was forcibly closed by a peer
  'EADDRINUSE', // could not bind to any free port
  'ECONNREFUSED', // connection was refused by the server
  'EPIPE', // the remote side of the stream being written has been closed
  'ENOTFOUND', // could not resolve the hostname to an IP address
  'ENETUNREACH', // no internet connection
  'EAI_AGAIN', // DNS lookup timed out
  'ERR_STREAM_PREMATURE_CLOSE', // Node.js: stream closed before it finished
  'ESERVFAIL' // Node.js dns: the DNS server reported a general failure
]);
/**
 * Retry-delay calculator with a short back-off; it is the `calculateDelay`
 * installed by default into the request retry options of this module.
 *
 * Returns 0 when no retry should happen (retry limit exceeded, or the
 * error is not considered retryable), otherwise a positive integer delay
 * in milliseconds. As a side effect it sets `error.retryLimitExceeded`.
 *
 * If you would like to implement it yourself,
 * set error.retryLimitExceeded to 1 or true
 * if attemptCount > retryOptions.limit
 * or you think retry should end
 *
 * @param {object} retryObject
 * @param {number} retryObject.attemptCount 1-based number of the attempt
 * @param {object} retryObject.retryOptions `limit`, `methods`,
 *   `errorCodes`, `statusCodes` and optional `maxRetryAfter` (ms)
 * @param {Error} retryObject.error the error that triggered the retry
 * @returns {number} integer delay in milliseconds, 0 to stop retrying
 */
export const calculateFastDelay = (retryObject) => {
  const { attemptCount, retryOptions, error: err } = retryObject;
  if (attemptCount > retryOptions.limit) {
    err.retryLimitExceeded = true;
    return 0;
  }
  err.retryLimitExceeded = false;
  // An empty list in the retry options falls back to built-in defaults:
  // GET for methods, retryErrorCodes for error codes.
  const hasMethod = err.options &&
    (retryOptions.methods.length ?
      retryOptions.methods.includes(err.options.method) :
      err.options.method === 'GET');
  const hasErrorCode = err.code &&
    (retryOptions.errorCodes.length ?
      retryOptions.errorCodes.includes(err.code) :
      retryErrorCodes.has(err.code));
  const hasStatusCode = retryOptions.statusCodes &&
    err.response &&
    retryOptions.statusCodes.includes(err.response.statusCode);
  // ReadError and TimeoutError are always retried for an allowed method.
  const retryable = hasMethod && (hasErrorCode || hasStatusCode ||
    err.name === 'ReadError' || err.name === 'TimeoutError');
  if (!retryable) {
    // A plain 404 is expected often enough that it is not worth logging.
    const isNotFound = err.name === 'HTTPError' &&
      err.response && err.response.statusCode === 404;
    if (!isNotFound) {
      error.error('calculateDelay SKIPPED', err.name, err.code, err.event, err.message, err.response && err.response.statusCode);
    }
    return 0;
  }
  // Roughly 0s, 2s, 5s for attempts 1..3 (plus up to 200ms of jitter),
  // then clamped to MAX_RETRY_DELAY +/- 500ms.
  let delay = ((2 * (attemptCount - 1)) * 1000) + Math.random() * 200;
  if (attemptCount > 2) {
    delay += 1000;
  }
  if (delay > MAX_RETRY_DELAY) {
    delay = MAX_RETRY_DELAY + (Math.random() - 0.5) * 1000;
  }
  // 429 Too Many Requests
  if (err.name === 'HTTPError' &&
    err.response && err.response.statusCode === 429) {
    // add random delay
    delay += 3000 + Math.random() * 3000;
    const retryAfterHeader = err.response.headers &&
      err.response.headers['retry-after'];
    if (retryAfterHeader) {
      // Retry-After is either decimal seconds or an HTTP date.
      let retryAfter = Number.parseInt(retryAfterHeader, 10);
      if (Number.isNaN(retryAfter)) {
        retryAfter = Date.parse(retryAfterHeader) - Date.now();
      }
      else {
        retryAfter *= 1000;
      }
      if (!Number.isNaN(retryAfter)) {
        // Math.trunc instead of `| 0`: a bitwise cast wraps waits longer
        // than ~24.8 days into negative numbers.
        retryAfter = Math.trunc(retryAfter);
        // A delay of 0 would cancel the retry, so a date in the past or
        // a zero wait becomes the smallest positive delay.
        if (retryAfter < 1) {
          retryAfter = 1;
        }
        // Honour the server's wish only while it stays within the
        // configured upper limit; otherwise keep the computed back-off.
        if (!retryOptions.maxRetryAfter ||
          retryAfter <= retryOptions.maxRetryAfter) {
          delay = retryAfter;
        }
      }
    }
  }
  // Integer result; non-finite values collapse to 0 like `| 0` did.
  return Number.isFinite(delay) ? Math.trunc(delay) : 0;
};
/**
 * Fallback values for every download option the caller may leave out.
 * defaultDownloadOptions() merges them into the caller's options.
 */
const defaultOptions = {
  // --- life-cycle hook lists, all empty by default ---
  init: [],
  dispose: [],
  // number of parallel downloads; also the fallback for invalid values
  concurrency: 12,
  configureLogger,
  createResource,
  detectResourceType: [],
  download: [],
  // hack: force cast
  encoding: {},
  linkRedirect: [],
  // empty, so checkDownloadOptions() rejects options that do not set it
  localRoot: '',
  maxDepth: 1,
  meta: {
    // NOTE(review): presumably the marker a complete html document
    // must contain - confirm against the code that reads it
    detectIncompleteHtml: '</html>'
  },
  processAfterDownload: [],
  processBeforeDownload: [],
  // request options; hooks, timeout, retry, ... are filled in later
  req: {},
  saveToDisk: [],
  deduplicateStripSearch: true
};
/**
 * Completes a set of download options with defaults.
 *
 * Missing top-level keys come from `defaultOptions` (via weakAssign);
 * afterwards the request options in `req` get a beforeRetry hook,
 * redirect/cookie settings, timeouts and a retry configuration unless
 * the caller already provided them.
 *
 * @param {object} options caller supplied options
 * @returns {object} the options with all defaults applied
 */
export function defaultDownloadOptions(options) {
  const merged = weakAssign(options, defaultOptions);
  // NOTE(review): weakAssign presumably copies missing keys by reference -
  // confirm. If so, `merged` would share the module-level default arrays
  // and objects, and the `req` mutations below would leak into every
  // later call. Give each result its own shallow copies.
  for (const key of Object.keys(defaultOptions)) {
    const fallback = defaultOptions[key];
    if (fallback !== null && typeof fallback === 'object' &&
      merged[key] === fallback) {
      merged[key] = Array.isArray(fallback) ?
        fallback.slice() : Object.assign({}, fallback);
    }
  }
  if (!merged.concurrency || merged.concurrency < 1) {
    merged.concurrency = defaultOptions.concurrency;
  }
  const req = merged.req;
  if (!req.hooks) {
    req.hooks = {};
  }
  if (!req.hooks.beforeRetry) {
    req.hooks.beforeRetry = [beforeRetryHook];
  }
  // `in` checks: an explicitly present key, even if undefined, is kept.
  if (!('maxRedirects' in req)) {
    req.maxRedirects = 15;
  }
  if (!('ignoreInvalidCookies' in req)) {
    req.ignoreInvalidCookies = true;
  }
  if (req.timeout === undefined) {
    // all values in milliseconds
    req.timeout = {
      lookup: 1000,
      connect: 3500,
      secureConnect: 4000,
      socket: 5000,
      send: 3000,
      response: 190000,
      request: 200000
    };
  }
  // Retry configuration built around calculateFastDelay.
  const fastRetry = (limit) => ({
    limit,
    maxRetryAfter: 60000,
    calculateDelay: calculateFastDelay
  });
  if (req.retry === undefined) {
    req.retry = fastRetry(25);
  }
  else if (typeof req.retry === 'number') {
    // a bare number is shorthand for the retry limit
    req.retry = fastRetry(req.retry);
  }
  else if (!req.retry.calculateDelay) {
    req.retry.calculateDelay = calculateFastDelay;
  }
  // Periodic concurrency adjustment needs a function; default it on the
  // merged result so it is applied to what is actually returned.
  if (merged.adjustConcurrencyPeriod &&
    merged.adjustConcurrencyPeriod > 0 &&
    !merged.adjustConcurrencyFunc) {
    merged.adjustConcurrencyFunc = adjust;
  }
  return merged;
}
/**
 * Validates the mandatory download options and then applies the defaults.
 *
 * Backslashes in `localSrcRoot`, if it is set, are rewritten to forward
 * slashes in place before the life-cycle checks run.
 *
 * @param {object} options options to validate; modified in place
 * @returns {object} result of defaultDownloadOptions(options)
 * @throws {TypeError} on a concurrency below 1, a missing localRoot,
 *   or an empty `download` / `saveToDisk` life cycle
 */
export function checkDownloadOptions(options) {
  const concurrency = options.concurrency;
  if (!concurrency || concurrency < 1) {
    throw new TypeError('Bad concurrency: ' + concurrency);
  }
  if (!options.localRoot) {
    throw new TypeError('localRoot is required');
  }
  const srcRoot = options.localSrcRoot;
  // loose `!= null` covers both null and undefined
  if (srcRoot != null && srcRoot.includes('\\')) {
    options.localSrcRoot = options.localSrcRoot.replace(/\\/g, '/');
  }
  const downloadHooks = options.download;
  if (!downloadHooks || !downloadHooks.length) {
    throw new TypeError('download life cycle is required');
  }
  const saveHooks = options.saveToDisk;
  if (!saveHooks || !saveHooks.length) {
    throw new TypeError('saveToDisk life cycle is required');
  }
  return defaultDownloadOptions(options);
}
/**
 * Applies `overrideOptions` on top of `options` and validates the result.
 *
 * `meta` is merged key by key and `req` is merged through got's
 * `Options`; every other key of `overrideOptions` simply replaces the
 * one in the base options. Both arguments are modified in place.
 *
 * @param {object|function(): object} options base options, or a factory
 *   returning them
 * @param {object} [overrideOptions] values that take precedence
 * @returns {object} the base options as they are when no override is
 *   given, otherwise the result of checkDownloadOptions()
 */
export function mergeOverrideOptions(options, overrideOptions) {
  const base = typeof options === 'function' ? options() : options;
  if (!overrideOptions) {
    return base;
  }
  if (base.meta && overrideOptions.meta) {
    overrideOptions.meta = Object.assign(base.meta, overrideOptions.meta);
  }
  if (base.req && overrideOptions.req) {
    const gotDefaults = got.defaults.options;
    // got no longer ships `mergeOptions`, and an `Options` instance must
    // not be reused (it would leak memory), so a throw-away instance is
    // built and only its internal plain object is kept. There is no
    // public api for this yet, see
    // https://github.com/website-local/website-scrap-engine/issues/1112
    const combined = new Options(base.req, overrideOptions.req, gotDefaults);
    overrideOptions.req = combined._internals;
  }
  const result = Object.assign(base, overrideOptions);
  return checkDownloadOptions(result);
}
//# sourceMappingURL=options.js.map