UNPKG

website-scrap-engine

Version:
150 lines 5.73 kB
/**
 * Pipeline executor.
 *
 * Runs the hooks stored on a life cycle object, one stage per method.
 * The array-based stages thread a value through their hooks in order and
 * stop as soon as a hook resolves to `undefined`.
 */
export class PipelineExecutorImpl {
  /**
   * @param lifeCycle object holding the hooks of every stage
   * @param requestOptions request options used by `download` when the
   *   caller passes none
   * @param options options handed to the hooks when the caller passes none
   */
  constructor(lifeCycle, requestOptions, options) {
    this.lifeCycle = lifeCycle;
    this.requestOptions = requestOptions;
    this.options = options;
  }

  /**
   * Awaits every `lifeCycle.init` hook in order.
   * Does nothing when the life cycle has no `init` hooks.
   *
   * @param pipeline forwarded to each hook
   * @param downloader forwarded to each hook
   */
  async init(pipeline, downloader) {
    if (!this.lifeCycle.init) {
      return;
    }
    for (const init of this.lifeCycle.init) {
      await init(pipeline, downloader);
    }
  }

  /**
   * Runs linkRedirect, detectResourceType, createResource and
   * processBeforeDownload for a url found in `parent`.
   *
   * @param rawUrl url before link redirection
   * @param defaultType initial type passed to the detectResourceType hooks
   * @param depth depth of the new resource; any falsy value (including 0)
   *   falls back to `parent.depth + 1`
   * @param element forwarded to the hooks
   * @param parent resource the url was found in
   * @returns the processed resource, or `undefined` when any stage yields
   *   a falsy url / type / resource
   */
  async createAndProcessResource(rawUrl, defaultType, depth, element, parent) {
    const url = await this.linkRedirect(rawUrl, element, parent);
    if (!url) {
      return;
    }
    const type = await this.detectResourceType(url, defaultType, element, parent);
    if (!type) {
      return;
    }
    const refUrl = parent.redirectedUrl || parent.url;
    // The parent's save path is only passed on when the referencing url
    // is the parent's own url, i.e. it was not replaced by redirectedUrl.
    const savePath = refUrl === parent.url ? parent.savePath : undefined;
    const r = await this.createResource(
      type,
      depth || parent.depth + 1,
      url,
      refUrl,
      parent.localRoot,
      this.options.encoding[type],
      savePath,
      parent.type
    );
    if (!r) {
      return;
    }
    return await this.processBeforeDownload(r, element, parent, this.options);
  }

  /**
   * Threads `url` through the `lifeCycle.linkRedirect` hooks.
   *
   * @returns the url produced by the last hook, or `undefined` as soon as
   *   a hook resolves to `undefined`
   */
  async linkRedirect(url, element, parent) {
    let redirectedUrl = url;
    for (const linkRedirectFunc of this.lifeCycle.linkRedirect) {
      redirectedUrl = await linkRedirectFunc(redirectedUrl, element, parent, this.options, this);
      if (redirectedUrl === undefined) {
        return undefined;
      }
    }
    return redirectedUrl;
  }

  /**
   * Threads `type` through the `lifeCycle.detectResourceType` hooks.
   *
   * @returns the type produced by the last hook, or `undefined` as soon as
   *   a hook resolves to `undefined`
   */
  async detectResourceType(url, type, element, parent) {
    let detectedType = type;
    for (const detectResourceTypeFunc of this.lifeCycle.detectResourceType) {
      detectedType = await detectResourceTypeFunc(url, detectedType, element, parent, this.options, this);
      if (detectedType === undefined) {
        return undefined;
      }
    }
    return detectedType;
  }

  /**
   * Builds the argument object and delegates to `lifeCycle.createResource`.
   *
   * @param localRoot falls back to `options.localRoot` when null/undefined
   * @param encoding falls back to `options.encoding[type]`, then `'utf8'`,
   *   when null/undefined
   * @returns whatever `lifeCycle.createResource` returns
   */
  createResource(type, depth, url, refUrl, localRoot, encoding, refSavePath, refType) {
    // `x != null` is false only for null and undefined, matching the
    // nullish fallbacks of the compiled original.
    const resolvedLocalRoot = localRoot != null ? localRoot : this.options.localRoot;
    let resolvedEncoding = encoding != null ? encoding : this.options.encoding[type];
    if (resolvedEncoding == null) {
      resolvedEncoding = 'utf8';
    }
    const arg = {
      type,
      depth,
      url,
      refUrl,
      refSavePath,
      refType,
      localRoot: resolvedLocalRoot,
      localSrcRoot: this.options.localSrcRoot,
      encoding: resolvedEncoding,
      keepSearch: !this.options.deduplicateStripSearch,
      skipReplacePathError: this.options.skipReplacePathError,
      generateSavePathFn: this.lifeCycle.generateSavePath,
    };
    return this.lifeCycle.createResource(arg);
  }

  /**
   * Threads `res` through the `lifeCycle.processBeforeDownload` hooks.
   *
   * @param options falls back to the executor's options when falsy
   * @returns the resource produced by the last hook, or `undefined` as
   *   soon as a hook resolves to `undefined`
   */
  async processBeforeDownload(res, element, parent, options) {
    const effectiveOptions = options || this.options;
    let processedResource = res;
    for (const processBeforeDownload of this.lifeCycle.processBeforeDownload) {
      processedResource = await processBeforeDownload(processedResource, element, parent, effectiveOptions, this);
      if (processedResource === undefined) {
        return undefined;
      }
    }
    return processedResource;
  }

  /**
   * Threads `res` through the `lifeCycle.download` hooks until one of them
   * yields a resource with a truthy `body`.
   *
   * @param requestOptions falls back to the executor's request options
   *   when falsy
   * @param options falls back to the executor's options when falsy
   * @returns the first hook result carrying a truthy `body`; `undefined`
   *   when `res.shouldBeDiscardedFromDownload` is set, when a hook
   *   resolves to `undefined`, or when no hook produced a body
   */
  async download(res, requestOptions, options) {
    if (res.shouldBeDiscardedFromDownload) {
      return undefined;
    }
    const effectiveRequestOptions = requestOptions || this.requestOptions;
    const effectiveOptions = options || this.options;
    let downloadedResource = res;
    for (const download of this.lifeCycle.download) {
      downloadedResource = await download(downloadedResource, effectiveRequestOptions, effectiveOptions, this);
      if (downloadedResource === undefined) {
        return undefined;
      }
      // if downloaded, end loop and return
      // (undefined was handled above, so only null still needs a guard)
      if (downloadedResource !== null && downloadedResource.body) {
        return downloadedResource;
      }
    }
    // not downloaded
    return undefined;
  }

  /**
   * Process resource after download, in worker thread
   * @param res resource received from main thread
   * @param submit function to submit resource to pipeline
   * @param options falls back to the executor's options when falsy
   * @returns the resource produced by the last hook, or `undefined` as
   *   soon as a hook resolves to `undefined`
   */
  async processAfterDownload(res, submit, options) {
    const effectiveOptions = options || this.options;
    let downloadedResource = res;
    for (const processAfterDownload of this.lifeCycle.processAfterDownload) {
      downloadedResource = await processAfterDownload(downloadedResource, submit, effectiveOptions, this);
      if (downloadedResource === undefined) {
        return undefined;
      }
    }
    return downloadedResource;
  }

  /**
   * Threads `res` through the `lifeCycle.saveToDisk` hooks.
   *
   * @param options falls back to the executor's options when falsy
   * @returns `undefined` as soon as a hook resolves to `undefined`,
   *   otherwise the resource left after the last hook
   */
  async saveToDisk(res, options) {
    const effectiveOptions = options || this.options;
    let downloadedResource = res;
    for (const saveToDisk of this.lifeCycle.saveToDisk) {
      downloadedResource = await saveToDisk(downloadedResource, effectiveOptions, this);
      if (downloadedResource === undefined) {
        // A hook ended the chain. NOTE(review): presumably this means the
        // hook already saved the resource - confirm against the hooks.
        return undefined;
      }
    }
    // No hook ended the chain; hand the resource back to the caller.
    return downloadedResource;
  }

  /**
   * Awaits every `lifeCycle.dispose` hook in order.
   * Does nothing when the life cycle has no `dispose` hooks.
   *
   * @param pipeline forwarded to each hook
   * @param downloader forwarded to each hook
   * @param workerInfo forwarded to each hook
   * @param workerExitCode forwarded to each hook
   */
  async dispose(pipeline, downloader, workerInfo, workerExitCode) {
    if (!this.lifeCycle.dispose) {
      return;
    }
    for (const dispose of this.lifeCycle.dispose) {
      await dispose(pipeline, downloader, workerInfo, workerExitCode);
    }
  }
}
//# sourceMappingURL=pipeline-executor-impl.js.map