website-scrap-engine
Version:
Configurable website scraper written in TypeScript
98 lines • 3.82 kB
JavaScript
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { WorkerPool } from './worker-pool.js';
import { skip } from '../logger/logger.js';
import { AbstractDownloader } from './main.js';
/**
 * Downloader that submits every downloaded resource to a {@link WorkerPool}
 * and enqueues whatever the pool's reply reports back.
 *
 * Downloading goes through `this.pipeline.download`; the reply from
 * `pool.submitTask` may carry follow-up resources (`body`), a redirect
 * target (`redirectedUrl`) and/or an `error`.
 */
export class MultiThreadDownloader extends AbstractDownloader {
  /**
   * @param pathToOptions path of the options module; forwarded to the base
   *   class and handed to the pool as part of its worker data.
   * @param overrideOptions option overrides forwarded to the base class.
   * @param _workerFactory optional factory passed through to the
   *   {@link WorkerPool} constructor as its last argument.
   */
  constructor(pathToOptions, overrideOptions, _workerFactory) {
    super(pathToOptions, overrideOptions);
    this.pathToOptions = pathToOptions;
    this._workerFactory = _workerFactory;
    // `init` is the same promise as `_initOptions`
    // (presumably created by the base class - confirm in main.js).
    this.init = this._initOptions;
    // Results of `pipeline.dispose` collected from worker 'exit' events;
    // awaited in dispose().
    this.workerDispose = [];
  }

  /**
   * Creates the worker pool and starts the pipeline.
   *
   * The pool size is `options.concurrency`, capped by `options.workerCount`
   * when that is set. Anything that is not a number >= 1 (including a
   * missing or NaN value) falls back to a single worker.
   *
   * @param options resolved options; `pathToWorker` and `maxLoad` are read
   *   from it in addition to `concurrency` / `workerCount`.
   * @returns the result of `addInitialResource` when `this.options.initialUrl`
   *   is set, otherwise a promise for `pipeline.init` chained after
   *   `_initOptions`.
   */
  _internalInit(options) {
    let workerCount = options.concurrency;
    if (options.workerCount) {
      workerCount = Math.min(options.workerCount, workerCount);
    }
    // Written as a negated `>=` so that undefined and NaN are clamped too;
    // a plain `workerCount < 1` is false for both.
    if (!(workerCount >= 1)) {
      workerCount = 1;
    }

    // The worker script should be compiled to .js and live next to this file.
    // `__dirname` does not exist in ES modules, so the default location is
    // resolved against import.meta.url instead.
    const workerScript = options.pathToWorker ||
      fileURLToPath(new URL('worker.js', import.meta.url));

    this._pool = new WorkerPool(
      workerCount,
      workerScript,
      { pathToOptions: this.pathToOptions, overrideOptions: options },
      options.maxLoad || -1,
      this._workerFactory
    );

    for (const info of this.pool.workers) {
      // Keep the value returned by pipeline.dispose so dispose() can wait
      // for it.
      info.worker.addListener('exit', exitCode => {
        this.workerDispose.push(
          this.pipeline.dispose(this.pipeline, this, info, exitCode)
        );
      });
    }

    if (this.options.initialUrl) {
      return this.addInitialResource(this.options.initialUrl);
    }
    return this._initOptions.then(() => this.pipeline.init(this.pipeline, this));
  }

  /**
   * The worker pool created by {@link _internalInit}.
   *
   * @throws {TypeError} when accessed before the pool has been created.
   */
  get pool() {
    if (this._pool) {
      return this._pool;
    }
    throw new TypeError('MultiThreadDownloader: pool not initialized');
  }

  /**
   * Downloads one resource, submits it to the pool and applies the reply.
   *
   * @param res the resource to download; `url`, `rawUrl` and `refUrl` are
   *   used for logging and `url` is added to `downloadedUrl`.
   * @returns resolves to `false` when downloading or submitting threw (the
   *   error is passed to `handleError`), otherwise to `undefined`.
   */
  async downloadAndProcessResource(res) {
    let downloaded;
    try {
      downloaded = await this.pipeline.download(res);
      if (!downloaded) {
        skip.debug('discarded after download', res.url, res.rawUrl, res.refUrl);
        return;
      }
    } catch (e) {
      this.handleError(e, 'downloading resource', res);
      return false;
    }

    let msg;
    try {
      const body = downloaded.body;
      // Buffer is a Uint8Array subclass, so ArrayBuffer.isView covers it too.
      const ownsWholeBuffer = ArrayBuffer.isView(body) &&
        body.byteOffset === 0 &&
        body.byteLength === body.buffer.byteLength &&
        body.buffer instanceof ArrayBuffer;
      if (ownsWholeBuffer) {
        // The view spans the entire underlying ArrayBuffer, so the buffer
        // itself is sent and listed in the second (transfer) argument.
        downloaded.body = body.buffer;
        msg = await this.pool.submitTask(downloaded, [downloaded.body]);
      } else {
        // Anything else is submitted without a transfer list.
        msg = await this.pool.submitTask(downloaded);
      }
    } catch (e) {
      this.handleError(e, 'submitting resource to worker', res);
      return false;
    }

    this.downloadedUrl.add(res.url);
    if (!msg) {
      skip.info('discarded in post-processing', res.url, res.rawUrl, res.refUrl);
      return;
    }
    if (msg.error) {
      // Reported but not fatal: whatever else the reply carries is still
      // applied below.
      this.handleError(msg.error, 'post-process', res);
    }
    const produced = msg.body;
    if (produced != null && produced.length) {
      produced.forEach(rawRes => this._addProcessedResource(rawRes));
    }
    if (msg.redirectedUrl) {
      this.queuedUrl.add(msg.redirectedUrl);
    }
  }

  /**
   * Disposes the base downloader, then the pool, then waits for every
   * collected per-worker `pipeline.dispose` result.
   *
   * The pool is disposed even when the base-class dispose rejects, and a
   * downloader whose pool was never created can still be disposed.
   */
  async dispose() {
    try {
      await super.dispose();
    } finally {
      // Read the field directly: the `pool` getter throws when it is unset.
      if (this._pool) {
        await this._pool.dispose();
      }
      const pending = this.workerDispose;
      this.workerDispose = [];
      await Promise.all(pending);
    }
  }
}
//# sourceMappingURL=multi.js.map