UNPKG

website-scrap-engine

Version:
391 lines (357 loc) 11.4 kB
import type {RequestError, RetryFunction, RetryObject, TimeoutError} from 'got'; import got, {Options} from 'got'; import type {ResourceEncoding, ResourceType} from './resource.js'; import {createResource} from './resource.js'; import type {ProcessingLifeCycle, RequestOptions} from './life-cycle/types.js'; // noinspection ES6PreferShortImport import {beforeRetryHook} from './life-cycle/download-resource.js'; import {error} from './logger/logger.js'; // noinspection ES6PreferShortImport import {adjust} from './downloader/adjust-concurrency.js'; import {configureLogger} from './logger/config-logger.js'; import type {DownloaderWithMeta} from './downloader/types.js'; import {weakAssign} from './util.js'; import type {SourceDefinition} from './sources.js'; import type {CheerioOptionsInterface} from './types.js'; /** * Extra options for custom life cycle */ export interface StaticDownloadMeta extends Record<string, string | number | boolean| void> { detectIncompleteHtml?: '</html>' | '</body>' | string; warnForNonHtml?: boolean; } /** * Options which should not be changed at runtime, and safe for cloning */ export interface StaticDownloadOptions { /** * @see Resource.localRoot */ localRoot: string; /** * Local source path to download from, * if empty or undefined, file:// url would not be accepted, * this should not start with file:// * Note: must use slash (/) on windows * https://github.com/website-local/website-scrap-engine/issues/126 */ localSrcRoot?: string; /** * Maximum recursive depth * @see Resource.depth */ maxDepth: number; /** * Downloading concurrency */ concurrency: number; /** * Resource default encoding by type. * * Encoding of a resource can be changed at * {@link ProcessingLifeCycle.processBeforeDownload} */ encoding: Record<ResourceType, ResourceEncoding>; /** * WorkerPool.coreSize = Math.min( * {@link concurrency}, * {@link workerCount} * ) */ workerCount?: number; /** * Minimum concurrency, for {@link DownloadOptions.adjustConcurrencyFunc} */ minConcurrency?: number; /** * If url search params should be stripped. * If false, saved file name would contains search or hash of search */ deduplicateStripSearch?: boolean; /** * Optional parse option for cheerio */ cheerioParse?: CheerioOptionsInterface; /** * Optional serialize option for cheerio */ cheerioSerialize?: CheerioOptionsInterface; /** * Custom html sources */ sources?: SourceDefinition[]; /** * Got options * * Never include functions or class instances * with {@link StaticDownloadOptions}. * Configure them functions or class instances * using {@link DownloadOptions} only. * @see RequestOptions * @see got.mergeOptions */ req?: RequestOptions; /** * Urls being pushed to pipeline with depth 0 and the url self as refUrl */ initialUrl?: string[]; /** * @see DownloadOptions.configureLogger */ logSubDir?: string; /** * @see StaticDownloadMeta */ meta: StaticDownloadMeta; /** * true to skip replacePath processing in case of parser error * https://github.com/website-local/website-scrap-engine/issues/107 * @see createResource */ skipReplacePathError?: boolean; /** * Wait for this.init in method onIdle. * See https://github.com/website-local/website-scrap-engine/issues/152 * @deprecated since 0.8.2 */ waitForInitBeforeIdle?: boolean; /** * Set last modified time of local saved file with value from the * Last-Modified http header, if available in response. * * https://developer.mozilla.org/docs/Web/HTTP/Headers/Last-Modified * https://github.com/website-local/website-scrap-engine/issues/174 */ preferRemoteLastModifiedTime?: boolean; } export interface DownloadOptions extends StaticDownloadOptions, ProcessingLifeCycle { /** * Functions or class callbacks can only be here. * * @see StaticDownloadOptions.req */ req: RequestOptions; /** * Adjust downloaded concurrency at runtime. * * Note: this would not affect worker_threads */ adjustConcurrencyPeriod?: number; adjustConcurrencyFunc?: (downloader: DownloaderWithMeta) => void; /** * Use a custom function to configure logger. */ configureLogger: typeof configureLogger; } export type ExtendedError = (TimeoutError | RequestError) & { retryLimitExceeded: boolean; }; const MAX_RETRY_DELAY = 5000; const retryErrorCodes: Set<string> = new Set([ // One of the timeout limits were reached. 'ETIMEDOUT', // Connection was forcibly closed by a peer. 'ECONNRESET', // Could not bind to any free port. 'EADDRINUSE', // Connection was refused by the server. 'ECONNREFUSED', // The remote side of the stream being written has been closed. 'EPIPE', // Couldn't resolve the hostname to an IP address. 'ENOTFOUND', // No internet connection. 'ENETUNREACH', // DNS lookup timed out. 'EAI_AGAIN', 'ERR_STREAM_PREMATURE_CLOSE', 'ESERVFAIL' ]); /** * If you would like to implement it yourself, * set error.retryLimitExceeded to 1 or true * if attemptCount > retryOptions.limit * or you think retry should end */ export const calculateFastDelay: RetryFunction = (retryObject: RetryObject): number => { const {attemptCount, retryOptions, error: err} = retryObject; if (attemptCount > retryOptions.limit) { (err as ExtendedError).retryLimitExceeded = true; return 0; } else { (err as ExtendedError).retryLimitExceeded = false; } const hasMethod: boolean = err.options && (retryOptions.methods.length ? retryOptions.methods.includes(err.options.method) : err.options.method === 'GET'); const hasErrorCode = err.code && (retryOptions.errorCodes.length ? retryOptions.errorCodes.includes(err.code) : retryErrorCodes.has(err.code as string)); const hasStatusCode: undefined | boolean = retryOptions.statusCodes && err.response && retryOptions.statusCodes.includes(err.response.statusCode); if (!hasMethod || (!hasErrorCode && !hasStatusCode && err.name !== 'ReadError' && err.name !== 'TimeoutError')) { if (err && !((err.name === 'HTTPError' && err.response && err.response.statusCode === 404))) { error.error('calculateDelay SKIPPED', err.name, err.code, (err as TimeoutError).event, err.message, err.response && err.response.statusCode); } return 0; } let delay: number = ((2 * (attemptCount - 1)) * 1000) + Math.random() * 200; if (attemptCount > 2) { delay += 1000; } if (delay > MAX_RETRY_DELAY) { delay = MAX_RETRY_DELAY + (Math.random() - 0.5) * 1000; } // 429 Too Many Requests if (err.name === 'HTTPError' && err.response && err.response.statusCode === 429) { // add random delay delay += 3000 + Math.random() * 3000; if (err.response.headers && err.response.headers['retry-after']) { let retryAfter = parseInt(err.response.headers['retry-after']); if (Number.isNaN(retryAfter)) { retryAfter = Date.parse(err.response.headers['retry-after']) - Date.now(); } else { retryAfter *= 1000; } if (!isNaN(retryAfter)) { retryAfter |= 0; if (retryAfter < 0) { retryAfter = 1; } if (retryOptions.maxRetryAfter) { if (retryAfter >= retryOptions.maxRetryAfter) { delay = retryAfter; } } else { delay = retryAfter; } } } } delay |= 0; return delay; }; const defaultOptions: DownloadOptions = { init: [], dispose: [], concurrency: 12, configureLogger, createResource, detectResourceType: [], download: [], // hack: force cast encoding: {} as DownloadOptions['encoding'], linkRedirect: [], localRoot: '', maxDepth: 1, meta: { detectIncompleteHtml: '</html>' }, processAfterDownload: [], processBeforeDownload: [], req: {}, saveToDisk: [], deduplicateStripSearch: true }; export function defaultDownloadOptions( options: ProcessingLifeCycle & Partial<DownloadOptions>): DownloadOptions { const merged: DownloadOptions = weakAssign(options, defaultOptions); // merged = weakAssign(merged, defaultOptions); if (!merged.concurrency || merged.concurrency < 1) { merged.concurrency = 12; } if (!merged.req.hooks) { merged.req.hooks = {}; } if (!merged.req.hooks.beforeRetry) { merged.req.hooks.beforeRetry = [beforeRetryHook]; } if (!('maxRedirects' in merged.req)) { merged.req.maxRedirects = 15; } if (!('ignoreInvalidCookies' in merged.req)) { merged.req.ignoreInvalidCookies = true; } if (!('timeout' in merged.req) || merged.req.timeout === undefined) { merged.req.timeout = { lookup: 1000, connect: 3500, secureConnect: 4000, socket: 5000, send: 3000, response: 190000, request: 200000 }; } if (!('retry' in merged.req) || merged.req.retry === undefined) { merged.req.retry = { limit: 25, maxRetryAfter: 60000, calculateDelay: calculateFastDelay }; } else if (typeof merged.req.retry === 'number') { merged.req.retry = { limit: merged.req.retry, maxRetryAfter: 60000, calculateDelay: calculateFastDelay }; } else if (!merged.req.retry.calculateDelay) { merged.req.retry.calculateDelay = calculateFastDelay; } if (options.adjustConcurrencyPeriod && options.adjustConcurrencyPeriod > 0 && !options.adjustConcurrencyFunc) { options.adjustConcurrencyFunc = adjust; } return merged; } export function checkDownloadOptions(options: DownloadOptions): DownloadOptions { if (!options.concurrency || options.concurrency < 1) { throw new TypeError('Bad concurrency: ' + options.concurrency); } if (!options.localRoot) { throw new TypeError('localRoot is required'); } if (options.localSrcRoot?.includes('\\')) { options.localSrcRoot = options.localSrcRoot.replace(/\\/g, '/'); } if (!options.download || !options.download.length) { throw new TypeError('download life cycle is required'); } if (!options.saveToDisk || !options.saveToDisk.length) { throw new TypeError('saveToDisk life cycle is required'); } return defaultDownloadOptions(options); } export function mergeOverrideOptions( options: DownloadOptions | (() => DownloadOptions), overrideOptions?: Partial<StaticDownloadOptions>): DownloadOptions { const opt: DownloadOptions = typeof options === 'function' ? options() : options; if (!overrideOptions) { return opt; } if (opt.meta && overrideOptions.meta) { overrideOptions.meta = Object.assign(opt.meta, overrideOptions.meta); } if (opt.req && overrideOptions.req) { const options = got.defaults.options; const mergedOptions = new Options(opt.req, overrideOptions.req, options); // New versions of got removed `mergeOptions` // Instances of `Options` can not be reused, or it will result in memory leak // Will try to find a better way as there is no public api for this // See https://github.com/website-local/website-scrap-engine/issues/1112 // eslint-disable-next-line @typescript-eslint/no-explicit-any overrideOptions.req = (mergedOptions as any)._internals; } return checkDownloadOptions(Object.assign(opt, overrideOptions)); }