website-scrap-engine
Version: 
Configurable website scraper in typescript
198 lines (175 loc) • 6.45 kB
text/typescript
import PQueue from 'p-queue';
import type {HTTPError} from 'got';
import URI from 'urijs';
import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
import {mergeOverrideOptions} from '../options.js';
import type {RawResource, Resource} from '../resource.js';
import {normalizeResource, ResourceType} from '../resource.js';
import {error, notFound, skip} from '../logger/logger.js';
import {importDefaultFromPath} from '../util.js';
import type {DownloaderStats, DownloaderWithMeta} from './types.js';
import {PipelineExecutorImpl} from './pipeline-executor-impl.js';
/**
 * Shared base for downloaders: loads the options module asynchronously,
 * owns the task queue, de-duplicates queued urls and drives the
 * optional concurrency-adjusting timer.
 *
 * Subclasses supply {@link AbstractDownloader.downloadAndProcessResource}
 * and may hook into initialization via `_internalInit`.
 */
export abstract class AbstractDownloader implements DownloaderWithMeta {
  /** Queue that every accepted resource is scheduled on. */
  readonly queue: PQueue;
  /** Pending default export of the module at `pathToOptions`. */
  readonly _asyncOptions: Promise<DownloadOptions>;
  /** Overrides merged into the loaded options via `mergeOverrideOptions`. */
  readonly _overrideOptions?: Partial<StaticDownloadOptions> & { pathToWorker?: string };
  /** Merged options; undefined until `_initOptions` has progressed far enough. */
  _options?: DownloadOptions;
  /** Becomes true once `_internalInit` has resolved. */
  _isInit: boolean;
  /** Pipeline executor; undefined until the options are loaded. */
  _pipeline?: PipelineExecutorImpl;
  /** Settles when option loading and `_internalInit` are done. */
  _initOptions: Promise<void>;
  /** Not written by this class; its size is exposed as `downloadedCount`. */
  readonly downloadedUrl: Set<string> = new Set<string>();
  /** De-duplication keys of every resource already handed to the queue. */
  readonly queuedUrl: Set<string> = new Set<string>();
  /** Counters exposed through `DownloaderWithMeta`; not updated in this class. */
  readonly meta: DownloaderStats = {
    currentPeriodCount: 0,
    firstPeriodCount: 0,
    lastPeriodCount: 0,
    lastPeriodTotalCount: 0
  };
  /** Handle of the interval that calls `adjustConcurrencyFunc`, if running. */
  adjustTimer: ReturnType<typeof setInterval> | void = undefined;

  /**
   * Starts loading the options module and chains the initialization steps.
   *
   * @param pathToOptions path handed to `importDefaultFromPath`
   * @param overrideOptions values merged over the loaded options
   */
  protected constructor(public pathToOptions: string,
    overrideOptions?: Partial<StaticDownloadOptions> & { pathToWorker?: string }) {
    this._asyncOptions = importDefaultFromPath(pathToOptions);
    this._overrideOptions = overrideOptions;
    // A safeguard here, concurrency is set later
    this.queue = new PQueue({concurrency: 2});
    this._isInit = false;
    this._initOptions = this._asyncOptions.then(loaded => {
      const options = mergeOverrideOptions(loaded, this._overrideOptions);
      this._options = options;
      // https://github.com/website-local/website-scrap-engine/issues/1113
      this.queue.concurrency = options.concurrency;
      this._pipeline = new PipelineExecutorImpl(options, options.req, options);
      options.configureLogger(options.localRoot, options.logSubDir || '');
      return this._internalInit(options).then(() => {
        this._isInit = true;
      });
    });
  }

  /**
   * Extension point invoked after the options are merged and the pipeline
   * is created; the default implementation does nothing.
   *
   * @param options the merged download options
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  protected _internalInit(options: DownloadOptions): Promise<void> {
    return Promise.resolve();
  }

  /**
   * The merged download options.
   *
   * @throws TypeError when the options have not been loaded yet
   */
  get options(): DownloadOptions {
    if (this._options) {
      return this._options;
    }
    throw new TypeError('AbstractDownloader: not initialized');
  }

  /**
   * The pipeline executor.
   *
   * @throws TypeError when the pipeline has not been created yet
   */
  get pipeline(): PipelineExecutorImpl {
    if (this._pipeline) {
      return this._pipeline;
    }
    throw new TypeError('AbstractDownloader: not initialized');
  }

  /** Current concurrency of the underlying queue. */
  get concurrency(): number {
    return this.queue.concurrency;
  }

  set concurrency(newConcurrency: number) {
    this.queue.concurrency = newConcurrency;
  }

  /** `size` of the underlying queue. */
  get queueSize(): number {
    return this.queue.size;
  }

  /** `pending` of the underlying queue. */
  get queuePending(): number {
    return this.queue.pending;
  }

  /**
   * Runs each url through the pipeline (redirect, type detection,
   * resource creation, pre-download processing) and queues the survivors.
   * A url is dropped as soon as any pipeline step yields a falsy value.
   *
   * @param urlArr the urls to start from
   */
  async addInitialResource(urlArr: string[]): Promise<void> {
    if (!this._pipeline) {
      // _initOptions could await addInitialResource
      await this._initOptions;
    }
    const pipeline = this.pipeline;
    await pipeline.init(pipeline, this);
    // noinspection DuplicatedCode
    for (let i = 0, l = urlArr.length; i < l; i++) {
      const url: string | void =
        await pipeline.linkRedirect(urlArr[i], null, null);
      if (!url) continue;
      const type: ResourceType | void = await pipeline.detectResourceType(
        url, ResourceType.Html, null, null);
      if (!type) continue;
      const created: Resource | void = await pipeline.createResource(
        type, 0, url, url,
        undefined, undefined, undefined, type);
      if (!created) continue;
      const processed: Resource | void =
        await pipeline.processBeforeDownload(created, null, null);
      if (!processed) continue;
      if (!processed.shouldBeDiscardedFromDownload) {
        this.addProcessedResource(processed);
      }
    }
  }

  /**
   * Queues a resource for download unless it is too deep or already queued.
   *
   * @param res the resource to queue
   * @returns `false` when the resource was skipped; nothing when it was queued
   */
  protected _addProcessedResource(res: RawResource): boolean | void {
    // noinspection DuplicatedCode
    if (res.depth > this.options.maxDepth) {
      skip.info('skipped max depth', res.url, res.refUrl, res.depth);
      return false;
    }
    // The de-duplication key never contains the hash, and drops the search
    // part as well when deduplicateStripSearch is set.
    const uri: URI = ((res as Resource)?.uri?.clone() || URI(res.url)).hash('');
    const url: string = this.options.deduplicateStripSearch
      ? uri.search('').toString()
      : uri.toString();
    if (this.queuedUrl.has(url)) {
      return false;
    }
    this.queuedUrl.add(url);
    const resource: Resource = normalizeResource(res);
    // cut the call stack
    const task = (): Promise<boolean | void> =>
      new Promise<boolean | void>((resolve, reject) => {
        setImmediate(() => {
          try {
            resolve(this.downloadAndProcessResource(resource));
          } catch (e) {
            // A synchronous throw inside setImmediate would otherwise
            // escape the promise as an uncaught exception.
            reject(e);
          }
        });
      });
    // Nobody awaits the promise returned by the queue, so a failed task
    // is reported here instead of becoming an unhandled rejection.
    this.queue.add(task).catch(
      err => this.handleError(err, 'downloading resource', resource));
  }

  /**
   * Downloads and processes a single queued resource.
   *
   * @param res the resource to download
   */
  abstract downloadAndProcessResource(res: RawResource): Promise<boolean | void>;

  /**
   * Exception-safe wrapper around `_addProcessedResource`.
   *
   * @param res the resource to queue
   * @returns `false` when the resource was skipped or queueing threw;
   * nothing when it was queued
   */
  addProcessedResource(res: RawResource): boolean | void {
    try {
      return this._addProcessedResource(res);
    } catch (e) {
      this.handleError(e, 'adding resource', res);
      return false;
    }
  }

  /**
   * Logs a failure that is related to a resource. An `HTTPError` whose
   * response status is 404 goes to the `notFound` logger, anything else
   * to the `error` logger.
   *
   * @param err the failure, may be empty
   * @param cause short description of what was being attempted
   * @param resource the resource the failure relates to
   */
  handleError(err: Error | unknown | null, cause: string, resource: RawResource): void {
    // force cast in case of typescript 4.4
    if (err && (err as {name?: string}).name === 'HTTPError' &&
      (err as HTTPError)?.response?.statusCode === 404) {
      notFound.error(resource.url, resource.downloadLink, resource.refUrl);
    } else if (err) {
      error.error(cause, resource.url, resource.downloadLink, resource.refUrl, err);
    } else {
      error.error(cause, resource.url, resource.downloadLink, resource.refUrl);
    }
  }

  /** Number of entries in `downloadedUrl`. */
  get downloadedCount(): number {
    return this.downloadedUrl.size;
  }

  /**
   * Once initialization has finished, (re)installs the concurrency
   * adjusting timer when `adjustConcurrencyFunc` is configured and starts
   * the queue.
   */
  start(): void {
    this._initOptions.then(() => {
      if (typeof this.options.adjustConcurrencyFunc === 'function') {
        if (this.adjustTimer) {
          clearInterval(this.adjustTimer);
        }
        this.adjustTimer = setInterval(
          () => this.options.adjustConcurrencyFunc?.(this),
          this.options.adjustConcurrencyPeriod || 60000);
      }
      this.queue.start();
    }).catch(err => {
      // start() returns nothing, so a failure of this chain cannot reach
      // the caller from here; log it rather than leave the derived promise
      // rejected without a handler. _initOptions itself still rejects for
      // callers that await onIdle() or addInitialResource().
      error.error('starting downloader', err);
    });
  }

  /** Cancels the concurrency adjusting timer and pauses the queue. */
  stop(): void {
    if (this.adjustTimer) {
      clearInterval(this.adjustTimer);
      // Do not keep a handle to a timer that is already cancelled.
      this.adjustTimer = undefined;
    }
    this.queue.pause();
  }

  /** Resolves with the queue's `onIdle()` after initialization has finished. */
  onIdle(): Promise<void> {
    return this._initOptions.then(() => this.queue.onIdle());
  }

  /**
   * Stops the downloader, clears the queue and disposes the pipeline when
   * one has been created.
   */
  async dispose(): Promise<void> {
    this.stop();
    this.queue.clear();
    // Read the backing field: the `pipeline` getter throws while the
    // downloader is uninitialized, which would make disposing an
    // uninitialized downloader fail.
    const pipeline = this._pipeline;
    if (pipeline) {
      await pipeline.dispose(pipeline, this);
    }
  }
}