website-scrap-engine
Version:
Configurable website scraper written in TypeScript
169 lines • 6.29 kB
JavaScript
import PQueue from 'p-queue';
import URI from 'urijs';
import { mergeOverrideOptions } from '../options.js';
import { normalizeResource, ResourceType } from '../resource.js';
import { error, notFound, skip } from '../logger/logger.js';
import { importDefaultFromPath } from '../util.js';
import { PipelineExecutorImpl } from './pipeline-executor-impl.js';
/**
 * Base class for downloaders: loads options asynchronously, owns the
 * download queue and the set of already-queued urls, and feeds resources
 * into the processing pipeline.
 *
 * Subclasses are expected to provide `downloadAndProcessResource(resource)`,
 * which is invoked for every queued resource but is not defined here.
 */
export class AbstractDownloader {
    /**
     * Starts loading the options module and wires up the queue and pipeline
     * once it resolves. Initialization is asynchronous; `_initOptions`
     * settles when it has finished.
     *
     * @param pathToOptions value handed to `importDefaultFromPath`
     *   (presumably a path to a module whose default export is the options).
     * @param overrideOptions merged over the loaded options by
     *   `mergeOverrideOptions`.
     */
    constructor(pathToOptions, overrideOptions) {
        this.pathToOptions = pathToOptions;
        this.downloadedUrl = new Set();
        // urls already handed to the queue, used for de-duplication
        this.queuedUrl = new Set();
        // Counters exposed on the instance; nothing in this class updates
        // them (presumably maintained by adjustConcurrencyFunc - confirm).
        this.meta = {
            currentPeriodCount: 0,
            firstPeriodCount: 0,
            lastPeriodCount: 0,
            lastPeriodTotalCount: 0
        };
        this.adjustTimer = undefined;
        this._asyncOptions = importDefaultFromPath(pathToOptions);
        this._overrideOptions = overrideOptions;
        // A safeguard here, concurrency is set later
        this.queue = new PQueue({ concurrency: 2 });
        this._isInit = false;
        this._initOptions = this._asyncOptions.then(options => {
            options = mergeOverrideOptions(options, this._overrideOptions);
            this._options = options;
            // https://github.com/website-local/website-scrap-engine/issues/1113
            this.queue.concurrency = options.concurrency;
            this._pipeline = new PipelineExecutorImpl(options, options.req, options);
            options.configureLogger(options.localRoot, options.logSubDir || '');
            return this._internalInit(options).then(() => {
                this._isInit = true;
            });
        });
    }

    /**
     * Extension hook awaited at the end of initialization.
     * The base implementation does nothing.
     *
     * @param options the merged options.
     * @returns {Promise<void>}
     */
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    _internalInit(options) {
        return Promise.resolve();
    }

    /**
     * The merged options.
     * @throws {TypeError} if the options have not been loaded yet.
     */
    get options() {
        if (this._options) {
            return this._options;
        }
        throw new TypeError('AbstractDownloader: not initialized');
    }

    /**
     * The pipeline executor.
     * @throws {TypeError} if the pipeline has not been created yet.
     */
    get pipeline() {
        if (this._pipeline) {
            return this._pipeline;
        }
        throw new TypeError('AbstractDownloader: not initialized');
    }

    /** Current concurrency of the underlying queue. */
    get concurrency() {
        return this.queue.concurrency;
    }

    set concurrency(newConcurrency) {
        this.queue.concurrency = newConcurrency;
    }

    /** `size` of the underlying queue. */
    get queueSize() {
        return this.queue.size;
    }

    /** `pending` of the underlying queue. */
    get queuePending() {
        return this.queue.pending;
    }

    /**
     * Runs every initial url through the pipeline stages (link redirect,
     * resource type detection, resource creation, pre-download processing)
     * and queues the ones that survive. A url is dropped as soon as any
     * stage yields a falsy value.
     *
     * @param urlArr array of initial urls.
     * @returns {Promise<void>}
     */
    async addInitialResource(urlArr) {
        if (!this._pipeline) {
            // _initOptions could await addInitialResource
            await this._initOptions;
        }
        const pipeline = this.pipeline;
        await pipeline.init(pipeline, this);
        // noinspection DuplicatedCode
        for (let i = 0, l = urlArr.length; i < l; i++) {
            const url = await pipeline.linkRedirect(urlArr[i], null, null);
            if (!url) {
                continue;
            }
            const type = await pipeline.detectResourceType(url, ResourceType.Html, null, null);
            if (!type) {
                continue;
            }
            const created = await pipeline.createResource(type, 0, url, url, undefined, undefined, undefined, type);
            if (!created) {
                continue;
            }
            const processed = await pipeline.processBeforeDownload(created, null, null);
            if (!processed) {
                continue;
            }
            if (!processed.shouldBeDiscardedFromDownload) {
                this.addProcessedResource(processed);
            }
        }
    }

    /**
     * Queues a processed resource unless it exceeds `options.maxDepth` or
     * its url (hash stripped, and search stripped too when
     * `options.deduplicateStripSearch` is set) has been queued before.
     *
     * @param res the resource to queue.
     * @returns `false` when the resource was skipped, `undefined` when it
     *   was queued.
     */
    _addProcessedResource(res) {
        // noinspection DuplicatedCode
        if (res.depth > this.options.maxDepth) {
            skip.info('skipped max depth', res.url, res.refUrl, res.depth);
            return false;
        }
        // Work on a clone so stripping hash/search never touches res.uri.
        const ownUri = res.uri == null ? undefined : res.uri.clone();
        const uri = (ownUri || URI(res.url)).hash('');
        const url = this.options.deduplicateStripSearch
            ? uri.search('').toString()
            : uri.toString();
        if (this.queuedUrl.has(url)) {
            return false;
        }
        this.queuedUrl.add(url);
        const resource = normalizeResource(res);
        // cut the call stack
        // noinspection JSIgnoredPromiseFromCall
        this.queue.add(() => new Promise(r => setImmediate(() => r(this.downloadAndProcessResource(resource)))));
    }

    /**
     * Same as `_addProcessedResource`, but synchronous errors are reported
     * through `handleError` instead of being thrown.
     *
     * @param res the resource to queue.
     * @returns `false` when the resource was skipped or an error occurred,
     *   `undefined` when it was queued.
     */
    addProcessedResource(res) {
        try {
            return this._addProcessedResource(res);
        }
        catch (e) {
            this.handleError(e, 'adding resource', res);
            return false;
        }
    }

    /**
     * Logs an error that happened while handling a resource.
     * An `HTTPError` whose response status code is 404 goes to the
     * `notFound` logger, everything else goes to the `error` logger.
     *
     * @param err the error, may be falsy.
     * @param cause short description of the failed action.
     * @param resource the resource being handled.
     */
    handleError(err, cause, resource) {
        const isNotFound = Boolean(err) && err.name === 'HTTPError' &&
            err.response != null && err.response.statusCode === 404;
        if (isNotFound) {
            notFound.error(resource.url, resource.downloadLink, resource.refUrl);
        }
        else if (err) {
            error.error(cause, resource.url, resource.downloadLink, resource.refUrl, err);
        }
        else {
            error.error(cause, resource.url, resource.downloadLink, resource.refUrl);
        }
    }

    /** Number of entries in `downloadedUrl`. */
    get downloadedCount() {
        return this.downloadedUrl.size;
    }

    /**
     * Starts the queue once initialization has finished. When
     * `options.adjustConcurrencyFunc` is a function it is additionally
     * invoked with this downloader every `options.adjustConcurrencyPeriod`
     * milliseconds (60000 when unset or falsy).
     */
    start() {
        this._initOptions.then(() => {
            if (typeof this.options.adjustConcurrencyFunc === 'function') {
                if (this.adjustTimer) {
                    clearInterval(this.adjustTimer);
                }
                this.adjustTimer = setInterval(() => {
                    // Re-read on every tick so a replaced function is honored.
                    const options = this.options;
                    const adjust = options.adjustConcurrencyFunc;
                    if (adjust != null) {
                        adjust.call(options, this);
                    }
                }, this.options.adjustConcurrencyPeriod || 60000);
            }
            this.queue.start();
        });
    }

    /**
     * Pauses the queue and cancels the periodic concurrency adjustment.
     */
    stop() {
        if (this.adjustTimer) {
            clearInterval(this.adjustTimer);
            // Drop the stale handle so the field reflects that no timer is active.
            this.adjustTimer = undefined;
        }
        this.queue.pause();
    }

    /**
     * @returns {Promise<void>} settles after initialization has finished
     *   and the queue's `onIdle()` promise has settled.
     */
    onIdle() {
        return this._initOptions.then(() => this.queue.onIdle());
    }

    /**
     * Stops the downloader, clears the queue and disposes the pipeline.
     * Safe to call before initialization has finished: in that case there
     * is no pipeline yet and only the queue is stopped and cleared.
     *
     * @returns {Promise<void>}
     */
    async dispose() {
        this.stop();
        this.queue.clear();
        // Read the backing field: the `pipeline` getter throws when the
        // pipeline does not exist, which would make disposal itself fail.
        const pipeline = this._pipeline;
        if (pipeline) {
            await pipeline.dispose(pipeline, this);
        }
    }
}
//# sourceMappingURL=main.js.map