website-scrap-engine
Version:
Configurable website scraper in typescript
198 lines (175 loc) • 6.45 kB
text/typescript
import PQueue from 'p-queue';
import type {HTTPError} from 'got';
import URI from 'urijs';
import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
import {mergeOverrideOptions} from '../options.js';
import type {RawResource, Resource} from '../resource.js';
import {normalizeResource, ResourceType} from '../resource.js';
import {error, notFound, skip} from '../logger/logger.js';
import {importDefaultFromPath} from '../util.js';
import type {DownloaderStats, DownloaderWithMeta} from './types.js';
import {PipelineExecutorImpl} from './pipeline-executor-impl.js';
/**
 * Base class for downloaders: owns the task queue, loads and merges the
 * download options asynchronously, builds the pipeline executor and
 * de-duplicates resources before they are queued.
 *
 * Subclasses supply {@link downloadAndProcessResource} and may override
 * {@link _internalInit} to run extra asynchronous setup.
 */
export abstract class AbstractDownloader implements DownloaderWithMeta {
  /** Queue on which every accepted resource is scheduled. */
  readonly queue: PQueue;
  /** Options as loaded from `pathToOptions`, before overrides are merged. */
  readonly _asyncOptions: Promise<DownloadOptions>;
  /** Overrides merged into the loaded options during initialization. */
  readonly _overrideOptions?: Partial<StaticDownloadOptions> & { pathToWorker?: string };
  /** Merged options; unset until the options promise has resolved. */
  _options?: DownloadOptions;
  /** Becomes `true` once {@link _internalInit} has resolved. */
  _isInit: boolean;
  /** Pipeline executor; unset until the options promise has resolved. */
  _pipeline?: PipelineExecutorImpl;
  /** Settles when option loading and {@link _internalInit} have finished. */
  _initOptions: Promise<void>;
  /** Not written by this class; its size is exposed as `downloadedCount`. */
  readonly downloadedUrl: Set<string> = new Set<string>();
  /** De-duplication keys of every resource that has already been queued. */
  readonly queuedUrl: Set<string> = new Set<string>();
  /** Counters required by `DownloaderWithMeta`; not updated by this class. */
  readonly meta: DownloaderStats = {
    currentPeriodCount: 0,
    firstPeriodCount: 0,
    lastPeriodCount: 0,
    lastPeriodTotalCount: 0
  };
  /** Handle of the periodic concurrency-adjust timer, if one is running. */
  adjustTimer: ReturnType<typeof setInterval> | void = undefined;

  /**
   * Starts loading the options module and chains the initialization steps.
   *
   * @param pathToOptions path passed to `importDefaultFromPath`
   * @param overrideOptions values merged over the loaded options
   */
  protected constructor(public pathToOptions: string,
    overrideOptions?: Partial<StaticDownloadOptions> & { pathToWorker?: string }) {
    this._asyncOptions = importDefaultFromPath(pathToOptions);
    this._overrideOptions = overrideOptions;
    // A safeguard here, concurrency is set later
    this.queue = new PQueue({concurrency: 2});
    this._isInit = false;
    this._initOptions = this._asyncOptions.then(options => {
      options = mergeOverrideOptions(options, this._overrideOptions);
      this._options = options;
      // https://github.com/website-local/website-scrap-engine/issues/1113
      this.queue.concurrency = options.concurrency;
      this._pipeline = new PipelineExecutorImpl(options, options.req, options);
      options.configureLogger(options.localRoot, options.logSubDir || '');
      return this._internalInit(options).then(() => {
        this._isInit = true;
      });
    });
  }

  /**
   * Extension point invoked with the merged options during initialization.
   * The default implementation does nothing.
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  protected _internalInit(options: DownloadOptions): Promise<void> {
    return Promise.resolve();
  }

  /**
   * The merged download options.
   *
   * @throws TypeError if the options have not been loaded yet
   */
  get options(): DownloadOptions {
    if (this._options) {
      return this._options;
    }
    throw new TypeError('AbstractDownloader: not initialized');
  }

  /**
   * The pipeline executor.
   *
   * @throws TypeError if the pipeline has not been created yet
   */
  get pipeline(): PipelineExecutorImpl {
    if (this._pipeline) {
      return this._pipeline;
    }
    throw new TypeError('AbstractDownloader: not initialized');
  }

  /** Current concurrency of the underlying queue. */
  get concurrency(): number {
    return this.queue.concurrency;
  }

  set concurrency(newConcurrency: number) {
    this.queue.concurrency = newConcurrency;
  }

  /** `size` of the underlying queue. */
  get queueSize(): number {
    return this.queue.size;
  }

  /** `pending` of the underlying queue. */
  get queuePending(): number {
    return this.queue.pending;
  }

  /**
   * Initializes the pipeline and feeds each given url through it
   * (redirect, type detection, resource creation, pre-download processing).
   * Urls for which any step yields a falsy value are skipped; the remaining
   * resources are queued unless flagged `shouldBeDiscardedFromDownload`.
   *
   * @param urlArr entry urls
   */
  async addInitialResource(urlArr: string[]): Promise<void> {
    if (!this._pipeline) {
      // _initOptions could await addInitialResource
      await this._initOptions;
    }
    const pipeline = this.pipeline;
    await pipeline.init(pipeline, this);
    // noinspection DuplicatedCode
    for (let i = 0, l = urlArr.length; i < l; i++) {
      let url: string | void = urlArr[i];
      url = await pipeline.linkRedirect(url, null, null);
      if (!url) continue;
      const type: ResourceType | void = await pipeline.detectResourceType(
        url, ResourceType.Html, null, null);
      if (!type) continue;
      let r: Resource | void = await pipeline.createResource(
        type, 0, url, url,
        undefined, undefined, undefined, type);
      if (!r) continue;
      r = await pipeline.processBeforeDownload(r, null, null);
      if (!r) continue;
      if (!r.shouldBeDiscardedFromDownload) {
        this.addProcessedResource(r);
      }
    }
  }

  /**
   * Queues a resource unless it exceeds `options.maxDepth` or its
   * de-duplication key has been queued before.
   *
   * The key is the resource url without its hash, and also without its
   * query string when `options.deduplicateStripSearch` is set.
   *
   * @returns `false` when the resource is rejected; nothing when it is queued
   */
  protected _addProcessedResource(res: RawResource): boolean | void {
    // noinspection DuplicatedCode
    if (res.depth > this.options.maxDepth) {
      skip.info('skipped max depth', res.url, res.refUrl, res.depth);
      return false;
    }
    let url: string;
    // clone an existing uri so stripping hash/search never mutates the resource
    const uri: URI = ((res as Resource)?.uri?.clone() || URI(res.url)).hash('');
    if (this.options.deduplicateStripSearch) {
      url = uri.search('').toString();
    } else {
      url = uri.toString();
    }
    if (this.queuedUrl.has(url)) {
      return false;
    }
    this.queuedUrl.add(url);
    const resource: Resource = normalizeResource(res);
    // cut the call stack
    // noinspection JSIgnoredPromiseFromCall
    this.queue.add(() => new Promise(r => setImmediate(
      () => r(this.downloadAndProcessResource(resource)))))
      // a rejected task must not surface as an unhandled rejection
      .catch(e => this.handleError(e, 'downloading resource', resource));
  }

  /** Downloads and processes one resource; invoked from the queue. */
  abstract downloadAndProcessResource(res: RawResource): Promise<boolean | void>;

  /**
   * Same as {@link _addProcessedResource}, except that a thrown error is
   * reported through {@link handleError} and turned into `false`.
   */
  addProcessedResource(res: RawResource): boolean | void {
    try {
      return this._addProcessedResource(res);
    } catch (e) {
      this.handleError(e, 'adding resource', res);
      return false;
    }
  }

  /**
   * Logs an error for a resource. An error named `HTTPError` whose response
   * status is 404 goes to the `notFound` logger; everything else goes to
   * the `error` logger together with `cause`.
   *
   * @param err the error, may be absent
   * @param cause short description of the failed action
   * @param resource resource that was being handled
   */
  handleError(err: Error | unknown | null, cause: string, resource: RawResource): void {
    // force cast in case of typescript 4.4
    if (err && (err as {name?: string}).name === 'HTTPError' &&
      (err as HTTPError)?.response?.statusCode === 404) {
      notFound.error(resource.url, resource.downloadLink, resource.refUrl);
    } else if (err) {
      error.error(cause, resource.url, resource.downloadLink, resource.refUrl, err);
    } else {
      error.error(cause, resource.url, resource.downloadLink, resource.refUrl);
    }
  }

  /** Number of entries in `downloadedUrl`. */
  get downloadedCount(): number {
    return this.downloadedUrl.size;
  }

  /**
   * Once initialization has finished, (re)installs the concurrency-adjust
   * timer when `options.adjustConcurrencyFunc` is a function, then starts
   * the queue. The timer period is `options.adjustConcurrencyPeriod`,
   * falling back to 60000 ms.
   */
  start(): void {
    // NOTE(review): a failed initialization rejects this chain with no
    // handler attached here; callers observe the failure through onIdle().
    this._initOptions.then(() => {
      if (typeof this.options.adjustConcurrencyFunc === 'function') {
        if (this.adjustTimer) {
          clearInterval(this.adjustTimer);
        }
        this.adjustTimer = setInterval(
          () => this.options.adjustConcurrencyFunc?.(this),
          this.options.adjustConcurrencyPeriod || 60000);
      }
      this.queue.start();
    });
  }

  /** Cancels the concurrency-adjust timer, if any, and pauses the queue. */
  stop(): void {
    if (this.adjustTimer) {
      clearInterval(this.adjustTimer);
      // drop the stale handle so the field reflects that no timer is active
      this.adjustTimer = undefined;
    }
    this.queue.pause();
  }

  /** Resolves after initialization has finished and the queue is idle. */
  onIdle(): Promise<void> {
    return this._initOptions.then(() => this.queue.onIdle());
  }

  /**
   * Stops the downloader, clears the queue and disposes the pipeline.
   * Does not throw when the pipeline has not been created yet.
   */
  async dispose(): Promise<void> {
    this.stop();
    this.queue.clear();
    // read the backing field: the `pipeline` getter throws when it is unset
    const pipeline = this._pipeline;
    if (pipeline) {
      await pipeline.dispose(pipeline, this);
    }
  }
}