/*
 * website-scrap-engine
 * Configurable website scraper in TypeScript.
 * Listing metadata: 233 lines (219 loc), 6.81 kB, text/typescript.
 */
import type {StaticDownloadOptions} from '../options.js';
import type {
CreateResourceArgument,
Resource,
ResourceEncoding,
ResourceType
} from '../resource.js';
import type {
DownloadResource,
ProcessingLifeCycle,
RequestOptions,
SubmitResourceFunc
} from '../life-cycle/types.js';
// noinspection ES6PreferShortImport
import type {PipelineExecutor} from '../life-cycle/pipeline-executor.js';
import type {Cheerio} from '../types.js';
import type {DownloaderWithMeta} from './types.js';
import type {WorkerInfo} from './worker-pool.js';
/**
 * Pipeline executor.
 *
 * Every stage method walks the matching array of hooks on {@link lifeCycle}
 * in order, awaiting each hook and feeding its result into the next one.
 * A hook that resolves to `undefined` aborts the stage, and the stage method
 * then resolves to `undefined` as well.
 */
export class PipelineExecutorImpl implements PipelineExecutor {
  /**
   * @param lifeCycle hook arrays (and the resource factory) driven by this
   * executor
   * @param requestOptions default request options, used by {@link download}
   * when the caller passes none
   * @param options default download options, used by every stage when the
   * caller passes none
   */
  constructor(public lifeCycle: ProcessingLifeCycle,
    public requestOptions: RequestOptions,
    public options: StaticDownloadOptions) {
  }

  /**
   * Run every `lifeCycle.init` hook sequentially.
   * Does nothing when the life cycle defines no `init` hooks.
   *
   * @param pipeline pipeline executor handed to each hook
   * @param downloader optional downloader handed to each hook
   */
  async init(
    pipeline: PipelineExecutor,
    downloader?: DownloaderWithMeta
  ): Promise<void> {
    if (!this.lifeCycle.init) return;
    for (const init of this.lifeCycle.init) {
      await init(pipeline, downloader);
    }
  }

  /**
   * Build a child resource for a link found in `parent` and run it through
   * the pre-download stages: {@link linkRedirect}, {@link detectResourceType},
   * {@link createResource} and {@link processBeforeDownload}.
   *
   * @param rawUrl url as found, before link redirection
   * @param defaultType type to start type detection from
   * @param depth depth of the new resource; when `null` or `undefined`
   * it defaults to `parent.depth + 1`. An explicit `0` is honoured.
   * @param element element the url came from, if any
   * @param parent resource that references the url
   * @returns the processed resource, or `undefined` if any stage dropped it
   */
  async createAndProcessResource(
    rawUrl: string,
    defaultType: ResourceType,
    depth: number | void | null,
    element: Cheerio | null,
    parent: Resource
  ): Promise<Resource | void> {
    const url: string | void = await this.linkRedirect(rawUrl, element, parent);
    // any falsy url (including an empty string) is treated as discarded
    if (!url) return;
    const type = await this.detectResourceType(url, defaultType, element, parent);
    if (!type) return;
    const refUrl = parent.redirectedUrl || parent.url;
    // the parent's save path is only forwarded when the referencing url
    // is the parent's own url (i.e. not a differing redirected url)
    const savePath = refUrl === parent.url ? parent.savePath : undefined;
    // `??` instead of `||`: only a missing depth falls back to the parent's
    // depth, so a caller-supplied depth of 0 is no longer overwritten
    const r = await this.createResource(type, depth ?? parent.depth + 1, url,
      refUrl,
      parent.localRoot,
      this.options.encoding[type],
      savePath,
      parent.type);
    if (!r) return;
    return await this.processBeforeDownload(r, element, parent, this.options);
  }

  /**
   * Pass `url` through every `lifeCycle.linkRedirect` hook in order.
   *
   * @param url url to redirect
   * @param element element the url came from, if any
   * @param parent resource that references the url, if any
   * @returns the url produced by the last hook, or `undefined` as soon as
   * one hook resolves to `undefined` (remaining hooks are skipped)
   */
  async linkRedirect(
    url: string,
    element: Cheerio | null,
    parent: Resource | null
  ): Promise<string | void> {
    let redirectedUrl: string | void = url;
    for (const linkRedirectFunc of this.lifeCycle.linkRedirect) {
      if ((redirectedUrl =
        await linkRedirectFunc(redirectedUrl as string,
          element, parent, this.options, this)) === undefined) {
        return undefined;
      }
    }
    return redirectedUrl;
  }

  /**
   * Pass `type` through every `lifeCycle.detectResourceType` hook in order.
   *
   * @param url url of the resource being classified
   * @param type initial type fed to the first hook
   * @param element element the url came from, if any
   * @param parent resource that references the url, if any
   * @returns the type produced by the last hook, or `undefined` as soon as
   * one hook resolves to `undefined` (remaining hooks are skipped)
   */
  async detectResourceType(
    url: string,
    type: ResourceType,
    element: Cheerio | null,
    parent: Resource | null
  ): Promise<ResourceType | void> {
    let detectedType: ResourceType | void = type;
    for (const detectResourceTypeFunc of this.lifeCycle.detectResourceType) {
      if ((detectedType =
        await detectResourceTypeFunc(url, detectedType as ResourceType,
          element, parent, this.options, this))
        === undefined) {
        return undefined;
      }
    }
    return detectedType;
  }

  /**
   * Assemble a {@link CreateResourceArgument} and delegate to
   * `lifeCycle.createResource`.
   *
   * @param type resource type
   * @param depth resource depth
   * @param url resource url
   * @param refUrl url of the referencing resource
   * @param localRoot falls back to `options.localRoot` when nullish
   * @param encoding falls back to `options.encoding[type]`, then `'utf8'`
   * @param refSavePath save path of the referencing resource, if known
   * @param refType type of the referencing resource, if known
   * @returns whatever `lifeCycle.createResource` returns
   */
  createResource(
    type: ResourceType,
    depth: number,
    url: string,
    refUrl: string,
    localRoot?: string,
    encoding?: ResourceEncoding,
    refSavePath?: string,
    refType?: ResourceType
  ): Resource {
    const arg: CreateResourceArgument = {
      type,
      depth,
      url,
      refUrl,
      refSavePath,
      refType,
      localRoot: localRoot ?? this.options.localRoot,
      localSrcRoot: this.options.localSrcRoot,
      encoding: encoding ?? this.options.encoding[type] ?? 'utf8',
      // the search string is kept unless deduplicateStripSearch is set
      keepSearch: !this.options.deduplicateStripSearch,
      skipReplacePathError: this.options.skipReplacePathError,
      generateSavePathFn: this.lifeCycle.generateSavePath,
    };
    return this.lifeCycle.createResource(arg);
  }

  /**
   * Pass `res` through every `lifeCycle.processBeforeDownload` hook in order.
   *
   * @param res resource to process
   * @param element element the resource url came from, if any
   * @param parent resource that references `res`, if any
   * @param options defaults to the executor's own options
   * @returns the resource produced by the last hook, or `undefined` as soon
   * as one hook resolves to `undefined` (remaining hooks are skipped)
   */
  async processBeforeDownload(
    res: Resource,
    element: Cheerio | null,
    parent: Resource | null,
    options?: StaticDownloadOptions
  ): Promise<Resource | void> {
    if (!options) {
      options = this.options;
    }
    let processedResource: Resource | void = res;
    for (const processBeforeDownload of this.lifeCycle.processBeforeDownload) {
      if ((processedResource =
        await processBeforeDownload(processedResource as DownloadResource,
          element, parent, options, this))
        === undefined) {
        return undefined;
      }
    }
    return processedResource;
  }

  /**
   * Run the `lifeCycle.download` hooks until one of them yields a resource
   * with a truthy `body`.
   *
   * @param res resource to download; skipped entirely when
   * `res.shouldBeDiscardedFromDownload` is set
   * @param requestOptions defaults to the executor's own request options
   * @param options defaults to the executor's own options
   * @returns the first resource carrying a truthy `body`, or `undefined`
   * when the resource is discarded, a hook resolves to `undefined`, or no
   * hook produced a body
   */
  async download(
    res: Resource,
    requestOptions?: RequestOptions,
    options?: StaticDownloadOptions
  ): Promise<DownloadResource | void> {
    if (res.shouldBeDiscardedFromDownload) {
      return undefined;
    }
    if (!requestOptions) {
      requestOptions = this.requestOptions;
    }
    if (!options) {
      options = this.options;
    }
    let downloadedResource: DownloadResource | Resource | void = res;
    for (const download of this.lifeCycle.download) {
      if ((downloadedResource = await download(
        downloadedResource as Resource, requestOptions, options, this))
        === undefined) {
        return undefined;
      }
      // if downloaded, end loop and return
      if ((downloadedResource as Resource)?.body) {
        return downloadedResource as DownloadResource;
      }
    }
    // not downloaded
    return undefined;
  }

  /**
   * Process resource after download, in worker thread
   * @param res resource received from main thread
   * @param submit function to submit resource to pipeline
   * @param options defaults to the executor's own options
   * @returns the resource produced by the last hook, or `undefined` as soon
   * as one hook resolves to `undefined` (remaining hooks are skipped)
   */
  async processAfterDownload(
    res: DownloadResource,
    submit: SubmitResourceFunc,
    options?: StaticDownloadOptions
  ): Promise<DownloadResource | void> {
    if (!options) {
      options = this.options;
    }
    let downloadedResource: DownloadResource | void = res;
    for (const processAfterDownload of this.lifeCycle.processAfterDownload) {
      if ((downloadedResource = await processAfterDownload(
        downloadedResource as DownloadResource, submit, options, this))
        === undefined) {
        return undefined;
      }
    }
    return downloadedResource;
  }

  /**
   * Pass `res` through every `lifeCycle.saveToDisk` hook in order.
   *
   * @param res downloaded resource to save
   * @param options defaults to the executor's own options
   * @returns `undefined` as soon as one hook resolves to `undefined`;
   * otherwise the resource produced by the last hook
   */
  async saveToDisk(
    res: DownloadResource,
    options?: StaticDownloadOptions
  ): Promise<DownloadResource | void> {
    if (!options) {
      options = this.options;
    }
    let downloadedResource: DownloadResource | void = res;
    for (const saveToDisk of this.lifeCycle.saveToDisk) {
      if ((downloadedResource = await saveToDisk(
        downloadedResource as DownloadResource, options, this))
        === undefined) {
        // a hook consumed the resource (presumably it has been saved —
        // confirm against the saveToDisk hook contract); stop here
        return undefined;
      }
    }
    // no hook consumed the resource: hand it back to the caller
    return downloadedResource;
  }

  /**
   * Run every `lifeCycle.dispose` hook sequentially.
   * Does nothing when the life cycle defines no `dispose` hooks.
   *
   * @param pipeline pipeline executor handed to each hook
   * @param downloader downloader handed to each hook
   * @param workerInfo optional worker info handed to each hook
   * @param workerExitCode optional worker exit code handed to each hook
   */
  async dispose(
    pipeline: PipelineExecutor,
    downloader: DownloaderWithMeta,
    workerInfo?: WorkerInfo,
    workerExitCode?: number
  ): Promise<void> {
    if (!this.lifeCycle.dispose) return;
    for (const dispose of this.lifeCycle.dispose) {
      await dispose(pipeline, downloader, workerInfo, workerExitCode);
    }
  }
}