UNPKG

website-scrap-engine

Version:
195 lines (178 loc) 6.17 kB
import type {OptionsInit as GotOptions} from 'got';
import type {
  createResource,
  GenerateSavePathFn,
  Resource,
  ResourceBody,
  ResourceType
} from '../resource.js';
import type {StaticDownloadOptions} from '../options.js';
import type {PipelineExecutor} from './pipeline-executor.js';
import type {Cheerio} from '../types.js';
import type {DownloaderWithMeta} from '../downloader/types.js';
import type {WorkerInfo} from '../downloader/worker-pool.js';

/**
 * Result of a life cycle hook: either the value itself or a promise of it,
 * so every hook may be written synchronously or asynchronously.
 */
export type AsyncResult<T> = T | Promise<T>;

/**
 * Initialization hook of the life cycle.
 */
export interface InitLifeCycleFunc {
  /**
   * Invoked on every initialization:
   * - of a downloader, once configureLogger has run
   *   and before addInitialResource;
   * - of a worker, once configureLogger has run
   *   and before parentPort?.addListener.
   *
   * The hook is allowed to be asynchronous.
   * On the main thread, addInitialResource should wait until it completes;
   * in a worker thread, the message listener should wait until it completes
   * before any message is processed.
   *
   * @param pipeline the PipelineExecutor
   * @param downloader the DownloaderWithMeta, given when in the main thread
   */
  (
    pipeline: PipelineExecutor,
    downloader?: DownloaderWithMeta
  ): AsyncResult<void>;
}

/**
 * Hook which rewrites a link.
 */
export interface LinkRedirectFunc {
  /**
   * Redirects a link before it is processed,
   * or before a child resource is created from it.
   *
   * @see PipelineExecutor.linkRedirect
   * @param url the link to redirect
   * @param element source element
   * @param parent source resource, null for an initial resource
   * @param options
   * @param pipeline
   * @return the redirected url, or void to skip both processing
   * and replacing the link with a relative path
   */
  (
    url: string,
    element: Cheerio | null,
    parent: Resource | null,
    options: StaticDownloadOptions,
    pipeline: PipelineExecutor
  ): AsyncResult<string | void>;
}

/**
 * Hook which decides the type of a resource.
 */
export interface DetectResourceTypeFunc {
  /**
   * Detects the resource type, possibly changing the one detected so far.
   *
   * @see PipelineExecutor.detectResourceType
   * @param url
   * @param type the most recently detected type
   * @param element source element
   * @param parent source resource, null for an initial resource
   * @param options
   * @param pipeline
   * @return the resource type, or void to discard the resource
   */
  (
    url: string,
    type: ResourceType,
    element: Cheerio | null,
    parent: Resource | null,
    options: StaticDownloadOptions,
    pipeline: PipelineExecutor
  ): AsyncResult<ResourceType | void>;
}

/**
 * Hook applied to a resource ahead of its download.
 */
export interface ProcessResourceBeforeDownloadFunc {
  /**
   * Processes the resource and filters it.
   *
   * @see PipelineExecutor.processBeforeDownload
   * @param res target resource
   * @param element source element
   * @param parent source resource, null for an initial resource
   * @param options
   * @param pipeline
   * @return the processed resource, or void to discard the resource
   */
  (
    res: Resource,
    element: Cheerio | null,
    parent: Resource | null,
    options: StaticDownloadOptions,
    pipeline: PipelineExecutor
  ): AsyncResult<Resource | void>;
}

/**
 * Request options handed to got; an alias of got's OptionsInit.
 */
export type RequestOptions = GotOptions;

/**
 * Hook which processes and filters a resource at download time.
 *
 * A resource should be downloaded only once,
 * and a downloaded resource does not continue through the pipeline.
 *
 * A resource is treated as downloaded when it has {@link Resource.body};
 * one which passes through the download pipeline
 * without a body is discarded.
 *
 * A pure-binary resource, from which child resources should never be created,
 * can be saved to disk here and filtered out.
 */
export interface DownloadResourceFunc {
  /**
   * @see PipelineExecutor.download
   * @param res target resource
   * @param requestOptions options passed to got
   * @param options
   * @param pipeline
   * @return the processed resource, or void to discard the resource
   * @throws Error when the download fails
   */
  (
    res: Resource,
    requestOptions: RequestOptions,
    options: StaticDownloadOptions,
    pipeline: PipelineExecutor
  ): AsyncResult<DownloadResource | Resource | void>;
}

/**
 * Callback which hands resources over to the pipeline.
 */
export interface SubmitResourceFunc {
  /**
   * Submits to the pipeline.
   *
   * @param res a single resource, or an array of resources
   */
  (res: Resource | Resource[]): void;
}

/**
 * A resource on which body is declared as a required ResourceBody.
 */
export interface DownloadResource extends Resource {
  body: ResourceBody;
}

/**
 * Hook applied to a resource once it is downloaded.
 */
export interface ProcessResourceAfterDownloadFunc {
  /**
   * Processes the downloaded resource; runs in the worker thread.
   *
   * @see PipelineExecutor.processAfterDownload
   * @param res the resource received from the main thread
   * @param submit function which submits resources to the pipeline
   * @param options
   * @param pipeline
   */
  (
    res: DownloadResource,
    submit: SubmitResourceFunc,
    options: StaticDownloadOptions,
    pipeline: PipelineExecutor
  ): AsyncResult<DownloadResource | void>;
}

/**
 * Hook which persists a downloaded resource.
 */
export interface SaveToDiskFunc {
  /**
   * Saves the resource to disk.
   *
   * @see PipelineExecutor.saveToDisk
   * @param res
   * @param options
   * @param pipeline
   * @return void when the resource has been saved to disk,
   * the resource when it has not been saved
   */
  (
    res: DownloadResource,
    options: StaticDownloadOptions,
    pipeline: PipelineExecutor
  ): AsyncResult<DownloadResource | void>;
}

/**
 * Disposal hook of the life cycle.
 */
export interface DisposeLifeCycle {
  /**
   * Invoked in the main thread:
   * - every time the dispose method of a downloader is called;
   * - every time the exit event is fired on a worker.
   *
   * @param pipeline the PipelineExecutor
   * @param downloader the DownloaderWithMeta
   * @param workerInfo the worker, when invoked for a worker exit event
   * @param workerExitCode exit code of the worker,
   * when invoked for a worker exit event
   */
  (
    pipeline: PipelineExecutor,
    downloader: DownloaderWithMeta,
    workerInfo?: WorkerInfo,
    workerExitCode?: number
  ): AsyncResult<void>;
}

/**
 * The complete set of hooks which make up the processing life cycle,
 * one list of hooks for each stage.
 */
export interface ProcessingLifeCycle {
  init: InitLifeCycleFunc[];
  linkRedirect: LinkRedirectFunc[];
  detectResourceType: DetectResourceTypeFunc[];
  generateSavePath?: GenerateSavePathFn | void;
  createResource: typeof createResource;
  /**
   * Once this stage is done, the link in the parent resource is replaced.
   */
  processBeforeDownload: ProcessResourceBeforeDownloadFunc[];
  /**
   * For a multi-thread downloader,
   * this is the only pipeline stage executed in the main thread.
   */
  download: DownloadResourceFunc[];
  processAfterDownload: ProcessResourceAfterDownloadFunc[];
  saveToDisk: SaveToDiskFunc[];
  dispose: DisposeLifeCycle[];
}