UNPKG

website-scrap-engine

Version:
169 lines 6.29 kB
import PQueue from 'p-queue';
import URI from 'urijs';
import { mergeOverrideOptions } from '../options.js';
import { normalizeResource, ResourceType } from '../resource.js';
import { error, notFound, skip } from '../logger/logger.js';
import { importDefaultFromPath } from '../util.js';
import { PipelineExecutorImpl } from './pipeline-executor-impl.js';

/**
 * Base downloader: loads the options module, builds the processing pipeline,
 * de-duplicates resource urls and schedules downloads on a PQueue.
 *
 * Subclasses are expected to provide `downloadAndProcessResource(resource)`,
 * which is what every queued task invokes; it is not defined in this class.
 */
export class AbstractDownloader {
  /**
   * @param pathToOptions path handed to `importDefaultFromPath` to load options
   * @param overrideOptions values merged over the loaded options
   *   via `mergeOverrideOptions`
   */
  constructor(pathToOptions, overrideOptions) {
    this.pathToOptions = pathToOptions;
    this.downloadedUrl = new Set();
    this.queuedUrl = new Set();
    this.meta = {
      currentPeriodCount: 0,
      firstPeriodCount: 0,
      lastPeriodCount: 0,
      lastPeriodTotalCount: 0
    };
    this.adjustTimer = undefined;
    this._asyncOptions = importDefaultFromPath(pathToOptions);
    this._overrideOptions = overrideOptions;
    // A safeguard here, concurrency is set later
    this.queue = new PQueue({ concurrency: 2 });
    this._isInit = false;
    this._initOptions = this._asyncOptions.then(options => {
      options = mergeOverrideOptions(options, this._overrideOptions);
      this._options = options;
      // https://github.com/website-local/website-scrap-engine/issues/1113
      this.queue.concurrency = options.concurrency;
      this._pipeline = new PipelineExecutorImpl(options, options.req, options);
      options.configureLogger(options.localRoot, options.logSubDir || '');
      return this._internalInit(options).then(() => {
        this._isInit = true;
      });
    });
  }

  /**
   * Hook for subclass initialization; runs after options and pipeline are set.
   * The base implementation does nothing.
   *
   * @param options the merged options
   * @returns {Promise<void>}
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  _internalInit(options) {
    return Promise.resolve();
  }

  /**
   * The merged options.
   * @throws {TypeError} if the options have not been loaded yet
   */
  get options() {
    if (this._options) {
      return this._options;
    }
    throw new TypeError('AbstractDownloader: not initialized');
  }

  /**
   * The pipeline executor.
   * @throws {TypeError} if the pipeline has not been created yet
   */
  get pipeline() {
    if (this._pipeline) {
      return this._pipeline;
    }
    throw new TypeError('AbstractDownloader: not initialized');
  }

  /** Current concurrency of the underlying queue. */
  get concurrency() {
    return this.queue.concurrency;
  }

  set concurrency(newConcurrency) {
    this.queue.concurrency = newConcurrency;
  }

  /** Value of the underlying queue's `size`. */
  get queueSize() {
    return this.queue.size;
  }

  /** Value of the underlying queue's `pending`. */
  get queuePending() {
    return this.queue.pending;
  }

  /**
   * Runs each initial url through the pipeline (redirect, type detection,
   * resource creation, pre-download processing) and queues the survivors.
   * Urls for which any pipeline step yields a falsy value are dropped.
   *
   * @param {string[]} urlArr initial urls
   * @returns {Promise<void>}
   */
  async addInitialResource(urlArr) {
    if (!this._pipeline) {
      // _initOptions could await addInitialResource
      await this._initOptions;
    }
    const pipeline = this.pipeline;
    await pipeline.init(pipeline, this);
    // noinspection DuplicatedCode
    for (const initialUrl of urlArr) {
      const url = await pipeline.linkRedirect(initialUrl, null, null);
      if (!url) continue;
      const type = await pipeline.detectResourceType(
        url, ResourceType.Html, null, null);
      if (!type) continue;
      const created = await pipeline.createResource(
        type, 0, url, url, undefined, undefined, undefined, type);
      if (!created) continue;
      const processed = await pipeline.processBeforeDownload(
        created, null, null);
      if (!processed) continue;
      if (!processed.shouldBeDiscardedFromDownload) {
        this.addProcessedResource(processed);
      }
    }
  }

  /**
   * Queues a processed resource for download unless it exceeds
   * `options.maxDepth` or its de-duplication key was already queued.
   *
   * @param res the resource to queue
   * @returns {boolean|undefined} `false` when the resource is skipped,
   *   `undefined` when it has been queued
   */
  _addProcessedResource(res) {
    // noinspection DuplicatedCode
    if (res.depth > this.options.maxDepth) {
      skip.info('skipped max depth', res.url, res.refUrl, res.depth);
      return false;
    }
    // work on a clone so stripping hash/search never touches res.uri
    const cloned = res.uri == null ? undefined : res.uri.clone();
    const uri = (cloned || URI(res.url)).hash('');
    const url = this.options.deduplicateStripSearch
      ? uri.search('').toString()
      : uri.toString();
    if (this.queuedUrl.has(url)) {
      return false;
    }
    this.queuedUrl.add(url);
    const resource = normalizeResource(res);
    // cut the call stack
    const task = () => new Promise((resolve, reject) => setImmediate(() => {
      try {
        resolve(this.downloadAndProcessResource(resource));
      } catch (e) {
        // a synchronous throw here would otherwise be an uncaught exception
        // and leave this queue task pending forever
        reject(e);
      }
    }));
    // the returned promise is not awaited, so rejections are reported here
    // instead of surfacing as unhandled rejections
    this.queue.add(task)
      .catch(e => this.handleError(e, 'downloading resource', resource));
  }

  /**
   * Same as `_addProcessedResource`, but errors are passed to `handleError`
   * instead of being thrown.
   *
   * @param res the resource to queue
   * @returns {boolean|undefined} `false` when skipped or on error
   */
  addProcessedResource(res) {
    try {
      return this._addProcessedResource(res);
    } catch (e) {
      this.handleError(e, 'adding resource', res);
      return false;
    }
  }

  /**
   * Logs an error: `HTTPError`s with status code 404 go to the `notFound`
   * logger, everything else to the `error` logger.
   *
   * @param err the error, may be falsy
   * @param {string} cause short description of the failed action
   * @param resource the related resource; a missing resource is tolerated so
   *   that error reporting itself cannot throw
   */
  handleError(err, cause, resource) {
    const { url, downloadLink, refUrl } = resource || {};
    const isNotFound = Boolean(err) &&
      err.name === 'HTTPError' &&
      err.response != null &&
      err.response.statusCode === 404;
    if (isNotFound) {
      notFound.error(url, downloadLink, refUrl);
    } else if (err) {
      error.error(cause, url, downloadLink, refUrl, err);
    } else {
      error.error(cause, url, downloadLink, refUrl);
    }
  }

  /** Number of entries in `downloadedUrl`. */
  get downloadedCount() {
    return this.downloadedUrl.size;
  }

  /**
   * Starts the queue once initialization has completed. When
   * `options.adjustConcurrencyFunc` is a function it is invoked every
   * `options.adjustConcurrencyPeriod` milliseconds (60000 when unset).
   */
  start() {
    this._initOptions.then(() => {
      if (typeof this.options.adjustConcurrencyFunc === 'function') {
        if (this.adjustTimer) {
          clearInterval(this.adjustTimer);
        }
        this.adjustTimer = setInterval(() => {
          const options = this.options;
          const adjust = options.adjustConcurrencyFunc;
          // re-checked on every tick: the option may be removed at runtime
          return adjust == null ? undefined : adjust.call(options, this);
        }, this.options.adjustConcurrencyPeriod || 60000);
      }
      this.queue.start();
    });
  }

  /** Cancels the concurrency adjust timer, if any, and pauses the queue. */
  stop() {
    if (this.adjustTimer) {
      clearInterval(this.adjustTimer);
      // drop the stale handle so the timer state reflects reality
      this.adjustTimer = undefined;
    }
    this.queue.pause();
  }

  /**
   * @returns {Promise<void>} the queue's `onIdle()` promise, requested after
   *   initialization has completed
   */
  onIdle() {
    return this._initOptions.then(() => this.queue.onIdle());
  }

  /**
   * Stops the downloader, clears the queue and disposes the pipeline.
   * Does not throw when the pipeline has not been created yet.
   *
   * @returns {Promise<void>}
   */
  async dispose() {
    this.stop();
    this.queue.clear();
    // read the backing field: the `pipeline` getter throws before init,
    // which would make this null check unreachable
    const pipeline = this._pipeline;
    if (pipeline) {
      await pipeline.dispose(pipeline, this);
    }
  }
}
//# sourceMappingURL=main.js.map