UNPKG

website-scrap-engine

Version:
219 lines 7.7 kB
import got, { Options } from 'got';
import { createResource } from './resource.js';
// noinspection ES6PreferShortImport
import { beforeRetryHook } from './life-cycle/download-resource.js';
import { error } from './logger/logger.js';
// noinspection ES6PreferShortImport
import { adjust } from './downloader/adjust-concurrency.js';
import { configureLogger } from './logger/config-logger.js';
import { weakAssign } from './util.js';

/** Cap (in ms) applied to the computed back-off before jitter is re-added. */
const MAX_RETRY_DELAY = 5000;

/** Error codes that are retried when `retryOptions.errorCodes` is empty. */
const retryErrorCodes = new Set([
  // One of the timeout limits was reached.
  'ETIMEDOUT',
  // Connection was forcibly closed by a peer.
  'ECONNRESET',
  // Could not bind to any free port.
  'EADDRINUSE',
  // Connection was refused by the server.
  'ECONNREFUSED',
  // The remote side of the stream being written has been closed.
  'EPIPE',
  // Couldn't resolve the hostname to an IP address.
  'ENOTFOUND',
  // No internet connection.
  'ENETUNREACH',
  // DNS lookup timed out.
  'EAI_AGAIN',
  'ERR_STREAM_PREMATURE_CLOSE',
  'ESERVFAIL'
]);

/**
 * Default `calculateDelay` implementation for the request retry options.
 *
 * Returns `0` to stop retrying (retry limit exceeded, or the error is not
 * eligible for a retry); otherwise returns a positive delay in milliseconds.
 * As a side effect, `error.retryLimitExceeded` is set to `true` when
 * `attemptCount` is greater than `retryOptions.limit`, and `false` otherwise.
 *
 * If you would like to implement it yourself,
 * set error.retryLimitExceeded to 1 or true
 * if attemptCount > retryOptions.limit
 * or you think retry should end
 *
 * @param {object} retryObject
 * @param {number} retryObject.attemptCount number of the current attempt
 * @param {object} retryObject.retryOptions reads `limit`, `methods`,
 *   `errorCodes`, `statusCodes` and `maxRetryAfter`
 * @param {object} retryObject.error the failure; reads `name`, `code`,
 *   `event`, `message`, `options.method` and `response`
 * @returns {number} delay in ms before the next attempt, or 0 for no retry
 */
export const calculateFastDelay = (retryObject) => {
  const { attemptCount, retryOptions, error: err } = retryObject;
  if (attemptCount > retryOptions.limit) {
    err.retryLimitExceeded = true;
    return 0;
  }
  err.retryLimitExceeded = false;

  // An empty `methods` / `errorCodes` list falls back to the built-in
  // defaults: GET only, and the codes in `retryErrorCodes`.
  const hasMethod = err.options && (retryOptions.methods.length
    ? retryOptions.methods.includes(err.options.method)
    : err.options.method === 'GET');
  const hasErrorCode = err.code && (retryOptions.errorCodes.length
    ? retryOptions.errorCodes.includes(err.code)
    : retryErrorCodes.has(err.code));
  const hasStatusCode = retryOptions.statusCodes &&
    err.response &&
    retryOptions.statusCodes.includes(err.response.statusCode);
  const isRetryableByName =
    err.name === 'ReadError' || err.name === 'TimeoutError';

  if (!hasMethod || (!hasErrorCode && !hasStatusCode && !isRetryableByName)) {
    // A plain 404 is expected often enough that it is not worth logging.
    const isNotFound = err.name === 'HTTPError' &&
      err.response &&
      err.response.statusCode === 404;
    if (!isNotFound) {
      error.error('calculateDelay SKIPPED',
        err.name,
        err.code,
        err.event,
        err.message,
        err.response && err.response.statusCode);
    }
    return 0;
  }

  // Linear back-off (2s per previous attempt) plus up to 200ms of jitter.
  let delay = ((2 * (attemptCount - 1)) * 1000) + Math.random() * 200;
  if (attemptCount > 2) {
    delay += 1000;
  }
  if (delay > MAX_RETRY_DELAY) {
    delay = MAX_RETRY_DELAY + (Math.random() - 0.5) * 1000;
  }

  // 429 Too Many Requests
  if (err.name === 'HTTPError' &&
    err.response &&
    err.response.statusCode === 429) {
    // add random delay
    delay += 3000 + Math.random() * 3000;
    const retryAfterHeader = err.response.headers &&
      err.response.headers['retry-after'];
    if (retryAfterHeader) {
      // The header is either a number of seconds or a date.
      let retryAfter = Number.parseInt(retryAfterHeader, 10);
      if (Number.isNaN(retryAfter)) {
        retryAfter = Date.parse(retryAfterHeader) - Date.now();
      } else {
        retryAfter *= 1000;
      }
      if (!Number.isNaN(retryAfter)) {
        retryAfter |= 0;
        if (retryAfter < 0) {
          retryAfter = 1;
        }
        // NOTE(review): with `maxRetryAfter` set, the header value is only
        // honoured when it is >= maxRetryAfter; smaller values keep the
        // randomised delay. This looks inverted - confirm the intent
        // before changing it.
        if (retryOptions.maxRetryAfter) {
          if (retryAfter >= retryOptions.maxRetryAfter) {
            delay = retryAfter;
          }
        } else {
          delay = retryAfter;
        }
      }
    }
  }

  // A return value of 0 means "do not retry" (see the early returns above),
  // so a retry that is actually wanted must never truncate down to 0, e.g.
  // when the first-attempt jitter is below 1ms or the server sent
  // `Retry-After: 0`.
  return Math.max(1, delay | 0);
};

/**
 * Builds a fresh set of default download options.
 *
 * A new object (with new nested objects and arrays) is created on every call
 * so that later mutation of one merged options object, such as the `req`
 * defaults filled in by `defaultDownloadOptions`, cannot leak into another.
 *
 * @returns {object} default download options
 */
const createDefaultOptions = () => ({
  init: [],
  dispose: [],
  concurrency: 12,
  configureLogger,
  createResource,
  detectResourceType: [],
  download: [],
  // hack: force cast
  encoding: {},
  linkRedirect: [],
  localRoot: '',
  maxDepth: 1,
  meta: {
    detectIncompleteHtml: '</html>'
  },
  processAfterDownload: [],
  processBeforeDownload: [],
  req: {},
  saveToDisk: [],
  deduplicateStripSearch: true
});

/**
 * Fills in defaults for everything the caller left unset.
 *
 * The given options are combined with the built-in defaults via
 * `weakAssign` (presumably only keys missing from `options` are filled in -
 * verify against `./util.js`), then the request (`req`) settings get their
 * defaults: `beforeRetry` hook, `maxRedirects`, `ignoreInvalidCookies`,
 * `timeout` and `retry`.
 *
 * @param {object} options user supplied download options
 * @returns {object} the merged options
 */
export function defaultDownloadOptions(options) {
  const merged = weakAssign(options, createDefaultOptions());
  if (!merged.concurrency || merged.concurrency < 1) {
    merged.concurrency = 12;
  }

  const req = merged.req;
  if (!req.hooks) {
    req.hooks = {};
  }
  if (!req.hooks.beforeRetry) {
    req.hooks.beforeRetry = [beforeRetryHook];
  }
  if (!('maxRedirects' in req)) {
    req.maxRedirects = 15;
  }
  if (!('ignoreInvalidCookies' in req)) {
    req.ignoreInvalidCookies = true;
  }
  if (!('timeout' in req) || req.timeout === undefined) {
    req.timeout = {
      lookup: 1000,
      connect: 3500,
      secureConnect: 4000,
      socket: 5000,
      send: 3000,
      response: 190000,
      request: 200000
    };
  }

  if (!('retry' in req) || req.retry === undefined) {
    req.retry = {
      limit: 25,
      maxRetryAfter: 60000,
      calculateDelay: calculateFastDelay
    };
  } else if (typeof req.retry === 'number') {
    // A bare number is shorthand for the retry limit.
    req.retry = {
      limit: req.retry,
      maxRetryAfter: 60000,
      calculateDelay: calculateFastDelay
    };
  } else if (!req.retry.calculateDelay) {
    req.retry.calculateDelay = calculateFastDelay;
  }

  // Work on `merged`, not on the raw `options` argument, so the default is
  // applied to the object that is actually returned to the caller.
  if (merged.adjustConcurrencyPeriod &&
    merged.adjustConcurrencyPeriod > 0 &&
    !merged.adjustConcurrencyFunc) {
    merged.adjustConcurrencyFunc = adjust;
  }
  return merged;
}

/**
 * Validates the mandatory download options, then applies the defaults.
 *
 * Backslashes in `localSrcRoot` are replaced with forward slashes in place.
 *
 * @param {object} options download options to validate
 * @returns {object} the result of `defaultDownloadOptions(options)`
 * @throws {TypeError} if `concurrency` is missing or below 1, `localRoot`
 *   is empty, or the `download` / `saveToDisk` life cycles are empty
 */
export function checkDownloadOptions(options) {
  if (!options.concurrency || options.concurrency < 1) {
    throw new TypeError('Bad concurrency: ' + options.concurrency);
  }
  if (!options.localRoot) {
    throw new TypeError('localRoot is required');
  }
  const localSrcRoot = options.localSrcRoot;
  if (localSrcRoot != null && localSrcRoot.includes('\\')) {
    options.localSrcRoot = localSrcRoot.replace(/\\/g, '/');
  }
  if (!options.download || !options.download.length) {
    throw new TypeError('download life cycle is required');
  }
  if (!options.saveToDisk || !options.saveToDisk.length) {
    throw new TypeError('saveToDisk life cycle is required');
  }
  return defaultDownloadOptions(options);
}

/**
 * Applies `overrideOptions` on top of `options` and validates the result.
 *
 * `meta` is merged key by key and `req` is merged through got's `Options`
 * class; every other key of `overrideOptions` replaces the one in `options`.
 * Both arguments are mutated. When `overrideOptions` is falsy the base
 * options are returned as they are, without validation.
 *
 * @param {object|function(): object} options base options, or a factory
 *   returning them
 * @param {object} [overrideOptions] options taking precedence over the base
 * @returns {object} the merged and validated options
 */
export function mergeOverrideOptions(options, overrideOptions) {
  const opt = typeof options === 'function' ? options() : options;
  if (!overrideOptions) {
    return opt;
  }
  if (opt.meta && overrideOptions.meta) {
    overrideOptions.meta = Object.assign(opt.meta, overrideOptions.meta);
  }
  if (opt.req && overrideOptions.req) {
    const gotDefaults = got.defaults.options;
    const mergedOptions = new Options(opt.req, overrideOptions.req, gotDefaults);
    // New versions of got removed `mergeOptions`
    // Instances of `Options` can not be reused, or it will result in memory leak
    // Will try to find a better way as there is no public api for this
    // See https://github.com/website-local/website-scrap-engine/issues/1112
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    overrideOptions.req = mergedOptions._internals;
  }
  return checkDownloadOptions(Object.assign(opt, overrideOptions));
}
//# sourceMappingURL=options.js.map