website-scrap-engine
Version:
Configurable website scraper in TypeScript
59 lines • 2.11 kB
JavaScript
import { AbstractDownloader } from './main.js';
import { skip } from '../logger/logger.js';
/**
 * Downloader that drives the pipeline stages (download, post-process, save)
 * one after another from within a single async method call.
 *
 * Relies on members supplied by AbstractDownloader: `pipeline`,
 * `downloadedUrl`, `queuedUrl`, `_initOptions`, `addInitialResource`,
 * `handleError` and `_addProcessedResource`.
 */
export class SingleThreadDownloader extends AbstractDownloader {
    /**
     * @param pathToOptions forwarded to the base constructor and also kept
     *   on the instance as `pathToOptions`
     * @param overrideOptions forwarded unchanged to the base constructor
     */
    constructor(pathToOptions, overrideOptions) {
        super(pathToOptions, overrideOptions);
        this.pathToOptions = pathToOptions;
        // Expose the base class's `_initOptions` value under the name `init`.
        this.init = this._initOptions;
    }

    /**
     * Starts the downloader from the given options.
     *
     * @param options object whose `initialUrl`, when truthy, is queued via
     *   `addInitialResource`; otherwise the pipeline's own `init` is invoked
     * @returns whatever the chosen initializer returns
     */
    _internalInit(options) {
        return options.initialUrl
            ? this.addInitialResource(options.initialUrl)
            : this.pipeline.init(this.pipeline, this);
    }

    /**
     * Downloads one resource, post-processes it and hands it to the pipeline
     * for saving. Failures in either phase are passed to `handleError`
     * instead of being thrown to the caller.
     *
     * @param res resource to download; its `url`, `rawUrl` and `refUrl` are
     *   used for logging and bookkeeping
     * @returns a promise that resolves to undefined on every path
     */
    async downloadAndProcessResource(res) {
        let downloaded;
        try {
            downloaded = await this.pipeline.download(res);
            // The log call stays inside the try so an error thrown while
            // reporting is also routed to handleError.
            if (!downloaded) {
                skip.debug('discarded after download', res.url, res.rawUrl, res.refUrl);
                return;
            }
        }
        catch (downloadError) {
            this.handleError(downloadError, 'downloading resource', res);
            return;
        }

        this.downloadedUrl.add(res.url);

        // Callback given to the post-processing stage: accepts either one
        // resource or an array of them and registers each one.
        const submit = (resources) => {
            if (!Array.isArray(resources)) {
                this._addProcessedResource(resources);
                return;
            }
            for (const item of resources) {
                this._addProcessedResource(item);
            }
        };

        try {
            const processed = await this.pipeline.processAfterDownload(downloaded, submit);
            if (!processed) {
                skip.warn('skipped downloaded resource', downloaded.url, downloaded.refUrl);
                return;
            }
            // A truthy result from saveToDisk is reported as "not saved".
            if (await this.pipeline.saveToDisk(processed)) {
                skip.warn('downloaded resource not saved', downloaded.url, downloaded.refUrl);
            }
            // Record the redirect target as queued when it differs from the
            // resource's own url.
            const { redirectedUrl } = processed;
            if (redirectedUrl && redirectedUrl !== processed.url) {
                this.queuedUrl.add(redirectedUrl);
            }
        }
        catch (processError) {
            this.handleError(processError, 'post-process', res);
        }
    }
}
//# sourceMappingURL=single.js.map