website-scrap-engine
Version:
Configurable website scraper written in TypeScript
150 lines • 5.73 kB
JavaScript
/**
 * Feeds a value through an ordered list of life-cycle stage functions.
 *
 * Every stage is called through `invoke` with the value produced by the
 * previous stage, and its awaited result becomes the new current value.
 * The chain is abandoned as soon as a stage resolves to exactly `undefined`;
 * any other result (including `null`) is handed to the next stage.
 *
 * @param stages iterable of stage functions, run in iteration order
 * @param initial value handed to the first stage
 * @param invoke `(stage, current) => result`; builds the argument list that
 *   this kind of stage expects around the current value
 * @return the last stage's result, `initial` when there are no stages,
 *   or `undefined` when a stage aborted the chain
 */
async function runStages(stages, initial, invoke) {
    let current = initial;
    for (const stage of stages) {
        current = await invoke(stage, current);
        if (current === undefined) {
            return undefined;
        }
    }
    return current;
}

/**
 * Pipeline executor.
 *
 * Runs the stage lists found on `lifeCycle` (`linkRedirect`,
 * `detectResourceType`, `processBeforeDownload`, `download`,
 * `processAfterDownload`, `saveToDisk`) plus the optional `init` / `dispose`
 * hooks, and delegates resource construction to `lifeCycle.createResource`.
 * In every stage chain a result of `undefined` stops the chain and becomes
 * the result of the method.
 */
export class PipelineExecutorImpl {
    /**
     * @param lifeCycle object holding the stage lists and hooks described above
     * @param requestOptions default request options handed to download stages
     * @param options default options handed to every stage
     */
    constructor(lifeCycle, requestOptions, options) {
        this.lifeCycle = lifeCycle;
        this.requestOptions = requestOptions;
        this.options = options;
    }

    /**
     * Awaits every `lifeCycle.init` hook in order.
     * Does nothing when `lifeCycle.init` is falsy.
     * @param pipeline forwarded to each hook
     * @param downloader forwarded to each hook
     */
    async init(pipeline, downloader) {
        const hooks = this.lifeCycle.init;
        if (!hooks) {
            return;
        }
        for (const hook of hooks) {
            await hook(pipeline, downloader);
        }
    }

    /**
     * Builds a child resource of `parent` from a raw link and runs it through
     * the `processBeforeDownload` stages.
     *
     * Steps: `linkRedirect` -> `detectResourceType` -> `createResource` ->
     * `processBeforeDownload`. The method resolves to `undefined` when the
     * redirected url, the detected type or the created resource is falsy.
     *
     * @param rawUrl link as found, before redirection
     * @param defaultType type used as the starting point for type detection
     * @param depth depth of the new resource; when `null` or `undefined` it
     *   defaults to `parent.depth + 1`. An explicit `0` is honoured.
     * @param element forwarded to the stages
     * @param parent resource the link was found in
     * @return the processed resource, or `undefined` if it was dropped
     */
    async createAndProcessResource(rawUrl, defaultType, depth, element, parent) {
        const url = await this.linkRedirect(rawUrl, element, parent);
        if (!url) {
            return;
        }
        const type = await this.detectResourceType(url, defaultType, element, parent);
        if (!type) {
            return;
        }
        const refUrl = parent.redirectedUrl || parent.url;
        // The parent's save path is only forwarded when the parent was not
        // redirected, i.e. when the referring url is still the parent's url.
        const savePath = refUrl === parent.url ? parent.savePath : undefined;
        // Only a missing depth falls back to the parent's depth; `||` would
        // also have discarded a deliberate depth of 0.
        const childDepth = depth === null || depth === undefined
            ? parent.depth + 1
            : depth;
        const created = await this.createResource(type, childDepth, url, refUrl,
            parent.localRoot, this.options.encoding[type], savePath, parent.type);
        if (!created) {
            return;
        }
        return await this.processBeforeDownload(created, element, parent, this.options);
    }

    /**
     * Runs `url` through the `lifeCycle.linkRedirect` stages.
     * Each stage is called as `(url, element, parent, options, pipeline)`.
     * @return the final url, or `undefined` if a stage resolved to `undefined`
     */
    async linkRedirect(url, element, parent) {
        return runStages(this.lifeCycle.linkRedirect, url,
            (redirect, current) => redirect(current, element, parent, this.options, this));
    }

    /**
     * Runs `type` through the `lifeCycle.detectResourceType` stages.
     * Each stage is called as `(url, type, element, parent, options, pipeline)`;
     * only the type changes between stages, the url stays the same.
     * @return the final type, or `undefined` if a stage resolved to `undefined`
     */
    async detectResourceType(url, type, element, parent) {
        return runStages(this.lifeCycle.detectResourceType, type,
            (detect, current) => detect(url, current, element, parent, this.options, this));
    }

    /**
     * Assembles the argument object for `lifeCycle.createResource` and
     * returns whatever that function returns (not awaited here).
     *
     * @param localRoot falls back to `options.localRoot` when `null`/`undefined`
     * @param encoding falls back to `options.encoding[type]`, then to `'utf8'`,
     *   when `null`/`undefined`
     */
    createResource(type, depth, url, refUrl, localRoot, encoding, refSavePath, refType) {
        const isMissing = (value) => value === null || value === undefined;
        const resolvedLocalRoot = isMissing(localRoot) ? this.options.localRoot : localRoot;
        let resolvedEncoding = encoding;
        if (isMissing(resolvedEncoding)) {
            resolvedEncoding = this.options.encoding[type];
        }
        if (isMissing(resolvedEncoding)) {
            resolvedEncoding = 'utf8';
        }
        return this.lifeCycle.createResource({
            type,
            depth,
            url,
            refUrl,
            refSavePath,
            refType,
            localRoot: resolvedLocalRoot,
            localSrcRoot: this.options.localSrcRoot,
            encoding: resolvedEncoding,
            keepSearch: !this.options.deduplicateStripSearch,
            skipReplacePathError: this.options.skipReplacePathError,
            generateSavePathFn: this.lifeCycle.generateSavePath,
        });
    }

    /**
     * Runs `res` through the `lifeCycle.processBeforeDownload` stages.
     * Each stage is called as `(resource, element, parent, options, pipeline)`.
     * @param options falls back to the executor's options when falsy
     * @return the final resource, or `undefined` if a stage resolved to `undefined`
     */
    async processBeforeDownload(res, element, parent, options) {
        const effectiveOptions = options || this.options;
        return runStages(this.lifeCycle.processBeforeDownload, res,
            (stage, current) => stage(current, element, parent, effectiveOptions, this));
    }

    /**
     * Runs `res` through the `lifeCycle.download` stages until one of them
     * yields a result with a truthy `body`.
     * Each stage is called as `(resource, requestOptions, options, pipeline)`.
     *
     * @param res resource to download; skipped entirely when its
     *   `shouldBeDiscardedFromDownload` flag is truthy
     * @param requestOptions falls back to the executor's request options when falsy
     * @param options falls back to the executor's options when falsy
     * @return the first result that has a `body`; `undefined` when the resource
     *   is discarded, a stage resolved to `undefined`, or no stage produced a body
     */
    async download(res, requestOptions, options) {
        if (res.shouldBeDiscardedFromDownload) {
            return undefined;
        }
        const effectiveRequestOptions = requestOptions || this.requestOptions;
        const effectiveOptions = options || this.options;
        let current = res;
        for (const stage of this.lifeCycle.download) {
            current = await stage(current, effectiveRequestOptions, effectiveOptions, this);
            if (current === undefined) {
                return undefined;
            }
            // A body means the download succeeded: skip the remaining stages.
            // `null` results are tolerated and passed on to the next stage.
            if (current !== null && current.body) {
                return current;
            }
        }
        // Every stage ran and none produced a body: not downloaded.
        return undefined;
    }

    /**
     * Process resource after download, in worker thread.
     * Each stage is called as `(resource, submit, options, pipeline)`.
     * @param res resource received from main thread
     * @param submit function to submit resource to pipeline
     * @param options falls back to the executor's options when falsy
     * @return the final resource, or `undefined` if a stage resolved to `undefined`
     */
    async processAfterDownload(res, submit, options) {
        const effectiveOptions = options || this.options;
        return runStages(this.lifeCycle.processAfterDownload, res,
            (stage, current) => stage(current, submit, effectiveOptions, this));
    }

    /**
     * Runs `res` through the `lifeCycle.saveToDisk` stages.
     * Each stage is called as `(resource, options, pipeline)`.
     *
     * NOTE(review): a stage resolving to `undefined` presumably means it has
     * taken care of the resource (e.g. written it), while a returned resource
     * means nothing has handled it yet - confirm against the stage contract.
     *
     * @param options falls back to the executor's options when falsy
     * @return the final resource, or `undefined` if a stage resolved to `undefined`
     */
    async saveToDisk(res, options) {
        const effectiveOptions = options || this.options;
        return runStages(this.lifeCycle.saveToDisk, res,
            (stage, current) => stage(current, effectiveOptions, this));
    }

    /**
     * Awaits every `lifeCycle.dispose` hook in order.
     * Does nothing when `lifeCycle.dispose` is falsy.
     * All four arguments are forwarded unchanged to each hook.
     */
    async dispose(pipeline, downloader, workerInfo, workerExitCode) {
        const hooks = this.lifeCycle.dispose;
        if (!hooks) {
            return;
        }
        for (const hook of hooks) {
            await hook(pipeline, downloader, workerInfo, workerExitCode);
        }
    }
}
//# sourceMappingURL=pipeline-executor-impl.js.map