website-scrap-engine
Version:
Configurable website scraper in typescript
106 lines (96 loc) • 3.76 kB
text/typescript
import {parentPort, workerData} from 'node:worker_threads';
import type {DownloadOptions, StaticDownloadOptions} from '../options.js';
import {mergeOverrideOptions} from '../options.js';
import type {
DownloadResource,
SubmitResourceFunc
} from '../life-cycle/types.js';
import type {RawResource, Resource} from '../resource.js';
import {normalizeResource, prepareResourceForClone} from '../resource.js';
import {skip} from '../logger/logger.js';
import {importDefaultFromPath} from '../util.js';
import type {DownloadWorkerMessage} from './types.js';
import {WorkerMessageType} from './types.js';
import {PipelineExecutorImpl} from './pipeline-executor-impl.js';
// noinspection ES6PreferShortImport
import type {PipelineExecutor} from '../life-cycle/pipeline-executor.js';
import type {WorkerTaskMessage} from './worker-type.js';
// Startup parameters received through `workerData` from node:worker_threads.
const workerInit: {
  pathToOptions: string,
  overrideOptions?: Partial<StaticDownloadOptions>
} = workerData;
// Path handed to importDefaultFromPath to load the download options.
const pathToOptions = workerInit.pathToOptions;
// Optional overrides merged on top of the loaded options; may be undefined.
const overrideOptions = workerInit.overrideOptions;
// Options are loaded once per worker, from the path supplied in workerData.
const asyncOptions: Promise<DownloadOptions> = importDefaultFromPath(pathToOptions);
// Resolves to the pipeline executor once the options are merged, the logger is
// configured and the (possibly asynchronous) pipeline init has completed.
const asyncPipeline = asyncOptions.then(loadedOptions => {
  const merged: DownloadOptions =
    mergeOverrideOptions(loadedOptions, overrideOptions);
  const executor: PipelineExecutor =
    new PipelineExecutorImpl(merged, merged.req, merged);
  merged.configureLogger(merged.localRoot, merged.logSubDir || '');
  const pendingInit = executor.init(executor);
  // init may return nothing or a promise; only chain on a thenable result
  if (!pendingInit || !(pendingInit as Promise<void>).then) {
    return executor;
  }
  return (pendingInit as Promise<void>).then(() => executor);
});
/**
 * Converts a caught value into something that can be posted to the parent
 * thread without failing the structured clone performed by `postMessage`.
 *
 * The value is cloned with `structuredClone` when that succeeds. If cloning
 * throws, objects are flattened into a plain record of stringified
 * enumerable properties (plus `name`, `message` and `stack` for `Error`
 * instances, which are not enumerable), and any other value is stringified.
 *
 * See https://github.com/website-local/website-scrap-engine/issues/340
 *
 * @param e the value caught while processing a task
 * @returns a value intended to be safe to send through `postMessage`
 */
function toTransferableError(e: unknown): unknown {
  try {
    // should always be
    if (typeof structuredClone === 'function') {
      return structuredClone(e);
    }
    // this is the old behavior before this
    return e;
  } catch {
    // can not clone, so no need to get the full error here
    if (e && typeof e === 'object') {
      const clone: Record<string, string> = {};
      if (e instanceof Error) {
        // for...in does not visit these, so copy them explicitly
        clone.name = e.name;
        clone.message = e.message;
        if (e.stack) {
          clone.stack = e.stack;
        }
      }
      for (const k in e) {
        clone[k] = String((e as Record<string, unknown>)[k]);
      }
      // the flattened copy must be reported, otherwise the parent would
      // receive no error at all and treat the failed task as successful
      return clone;
    }
    return String(e);
  }
}

/**
 * Handles one task message from the parent thread: runs the post-download
 * part of the pipeline on the received resource, saves it to disk, and always
 * replies with a `Complete` message carrying the resources submitted during
 * processing, the error (if any) and the redirected url (if it differs from
 * the resource url).
 */
parentPort?.addListener('message', async (msg: WorkerTaskMessage<RawResource>) => {
  const collectedResource: RawResource[] = [];
  let error: unknown;
  let redirectedUrl: string | undefined;
  try {
    const pipeline = await asyncPipeline;
    const res = msg.body;
    const downloadResource: DownloadResource = normalizeResource(res) as DownloadResource;
    // resources submitted by the pipeline are collected and sent back to the
    // parent in the reply instead of being processed in this worker
    const submit: SubmitResourceFunc = (resources: Resource | Resource[]) => {
      if (Array.isArray(resources)) {
        for (let i = 0; i < resources.length; i++) {
          collectedResource.push(prepareResourceForClone(resources[i]));
        }
      } else {
        collectedResource.push(prepareResourceForClone(resources));
      }
    };
    const processedResource: DownloadResource | void =
      await pipeline.processAfterDownload(downloadResource, submit);
    if (!processedResource) {
      skip.warn('skipped downloaded resource',
        downloadResource.url, downloadResource.refUrl);
    } else if (await pipeline.saveToDisk(processedResource)) {
      // a truthy result from saveToDisk is reported as "not saved"
      skip.warn('downloaded resource not saved',
        downloadResource.url, downloadResource.refUrl);
    }
    if (processedResource && processedResource.redirectedUrl &&
      processedResource.redirectedUrl !== processedResource.url) {
      redirectedUrl = processedResource.redirectedUrl;
    }
  } catch (e) {
    // handle if object could not be cloned here
    // https://github.com/website-local/website-scrap-engine/issues/340
    error = toTransferableError(e);
  } finally {
    // always reply, so the parent can complete the task identified by taskId
    const message: DownloadWorkerMessage = {
      taskId: msg.taskId,
      type: WorkerMessageType.Complete,
      body: collectedResource,
      error,
      redirectedUrl
    };
    parentPort?.postMessage(message);
  }
});