website-scrap-engine
Version:
Configurable website scraper in TypeScript
87 lines • 3.42 kB
JavaScript
import { parentPort, workerData } from 'node:worker_threads';
import { mergeOverrideOptions } from '../options.js';
import { normalizeResource, prepareResourceForClone } from '../resource.js';
import { skip } from '../logger/logger.js';
import { importDefaultFromPath } from '../util.js';
import { WorkerMessageType } from './types.js';
import { PipelineExecutorImpl } from './pipeline-executor-impl.js';
// Start-up parameters handed to this worker through `workerData`.
const { pathToOptions, overrideOptions } = workerData;
// Promise for the options module's default export
// (presumably a dynamic import performed by importDefaultFromPath — confirm in ../util.js).
const asyncOptions = importDefaultFromPath(pathToOptions);
/**
 * Turn the loaded options into a pipeline executor.
 *
 * Steps, in order: merge the overrides into the loaded options, construct
 * the executor, configure the logger, then run the executor's `init` hook.
 *
 * @param loadedOptions the value `asyncOptions` resolved to
 * @returns the executor, or a promise for it when `init` returned a thenable
 */
const buildPipeline = (loadedOptions) => {
    const merged = mergeOverrideOptions(loadedOptions, overrideOptions);
    const executor = new PipelineExecutorImpl(merged, merged.req, merged);
    merged.configureLogger(merged.localRoot, merged.logSubDir || '');
    const pending = executor.init(executor);
    // Synchronous (or absent) init result: the executor is usable right away.
    if (!pending || !pending.then) {
        return executor;
    }
    // Asynchronous init: hand the executor out only after init has settled.
    return pending.then(() => executor);
};
// Shared by every incoming message; each handler awaits it before doing any work.
const asyncPipeline = asyncOptions.then(buildPipeline);
/**
 * Convert a thrown value into something that can be sent through
 * `parentPort.postMessage`.
 *
 * The value is first run through `structuredClone`; if that succeeds the
 * clone is returned. If cloning throws, a lossy description is built
 * instead: objects become a flat record of stringified properties, anything
 * else becomes `String(e)`.
 *
 * See https://github.com/website-local/website-scrap-engine/issues/340
 *
 * @param e the value caught while processing a resource
 * @returns a clone of `e`, `e` itself when `structuredClone` is unavailable,
 *   or a stringified stand-in when `e` can not be cloned
 */
function toPostableError(e) {
    try {
        // structuredClone is expected to exist on supported runtimes; when it
        // does not, keep the old behavior of sending the value unchanged.
        return typeof structuredClone === 'function' ? structuredClone(e) : e;
    }
    catch (_a) {
        // can not clone, so no need to get the full error here
    }
    if (!e || typeof e !== 'object') {
        return String(e);
    }
    const clone = {};
    for (const key in e) {
        clone[key] = stringifyProperty(e, key);
    }
    // `for...in` only visits enumerable properties, while `name`, `message`
    // and `stack` of Error objects are non-enumerable: copy them explicitly.
    for (const key of ['name', 'message', 'stack']) {
        if (key in clone) {
            continue;
        }
        let value;
        try {
            value = e[key];
        }
        catch (_b) {
            continue;
        }
        if (typeof value === 'string') {
            clone[key] = value;
        }
    }
    return clone;
}
/**
 * Read `source[key]` and convert it to a string without ever throwing.
 *
 * @param source object being described
 * @param key property name to read
 * @returns the stringified property, or a placeholder when reading or
 *   converting it throws (e.g. throwing getter, null-prototype object)
 */
function stringifyProperty(source, key) {
    try {
        return String(source[key]);
    }
    catch (_a) {
        return '[unserializable]';
    }
}
// `parentPort` is null when this module is not running as a worker thread;
// in that case no listener is registered.
if (parentPort) {
    /**
     * Handle one task message from the parent thread.
     *
     * Reads the resource from `msg.body`, runs it through
     * `processAfterDownload` and `saveToDisk`, and always answers with a
     * single `Complete` message carrying the submitted resources, the error
     * (if any) and the redirected url (if it differs from the resource url).
     */
    parentPort.addListener('message', async (msg) => {
        const collectedResource = [];
        let error;
        let redirectedUrl;
        try {
            const pipeline = await asyncPipeline;
            const res = msg.body;
            const downloadResource = normalizeResource(res);
            // Collects resources submitted by the pipeline; accepts one
            // resource or an array of them.
            const submit = (resources) => {
                if (Array.isArray(resources)) {
                    for (let i = 0; i < resources.length; i++) {
                        collectedResource.push(prepareResourceForClone(resources[i]));
                    }
                }
                else {
                    collectedResource.push(prepareResourceForClone(resources));
                }
            };
            const processedResource = await pipeline.processAfterDownload(downloadResource, submit);
            if (!processedResource) {
                skip.warn('skipped downloaded resource', downloadResource.url, downloadResource.refUrl);
            }
            else if (await pipeline.saveToDisk(processedResource)) {
                // A truthy result from saveToDisk is reported as "not saved".
                skip.warn('downloaded resource not saved', downloadResource.url, downloadResource.refUrl);
            }
            if (processedResource && processedResource.redirectedUrl &&
                processedResource.redirectedUrl !== processedResource.url) {
                redirectedUrl = processedResource.redirectedUrl;
            }
        }
        catch (e) {
            // Always hand the parent a value that survives postMessage, so a
            // failure is never reported as a success.
            error = toPostableError(e);
        }
        finally {
            const message = {
                taskId: msg.taskId,
                type: WorkerMessageType.Complete,
                body: collectedResource,
                error,
                redirectedUrl
            };
            parentPort.postMessage(message);
        }
    });
}
//# sourceMappingURL=worker.js.map