website-scrap-engine
Version:
Configurable website scraper in TypeScript
70 lines • 2.39 kB
JavaScript
import { load } from 'cheerio';
import { ResourceType } from '../resource.js';
import { toString } from '../util.js';
/**
 * Wraps a predicate into a link-processing step that can drop a URL.
 *
 * @param {Function} fn - Predicate invoked as `fn(url, element, parent)`.
 * @returns {Function} A step `(url, element, parent)` that returns
 *   `undefined` when `fn` yields a truthy value, and the untouched `url`
 *   otherwise.
 */
export const skipProcess = (fn) => (url, element, parent) => {
  if (fn(url, element, parent)) {
    return undefined;
  }
  return url;
};
/**
 * Wraps a predicate into a step that flags matching resources as discarded.
 *
 * @param {Function} fn - Predicate invoked as `fn(res)`.
 * @returns {Function} A step `(res)` that sets
 *   `res.shouldBeDiscardedFromDownload = true` when `fn(res)` is truthy and
 *   always returns the same `res` object.
 */
export const dropResource = (fn) => (res) => {
  const matched = fn(res);
  if (!matched) {
    // Leave the flag exactly as it was; it is never reset to false here.
    return res;
  }
  res.shouldBeDiscardedFromDownload = true;
  return res;
};
/**
 * Wraps a callback into a step that runs it for its side effects only.
 *
 * @param {Function} fn - Callback invoked as
 *   `fn(res.url, element, res, parent)`; its return value is ignored.
 * @returns {Function} A step `(res, element, parent)` that always returns
 *   the same `res` object it was given.
 */
export const preProcess = (fn) => (res, element, parent) => {
  const { url } = res;
  // Note the argument order: the resource sits between element and parent.
  fn(url, element, res, parent);
  return res;
};
/**
 * Wraps a link-rewriting function into a step that updates the download link.
 *
 * @param {Function} fn - Invoked as `fn(res.downloadLink, res)`; expected to
 *   return the replacement link, or a falsy value to reject the resource.
 * @returns {Function} A step `(res)` that:
 *   - returns `res` unchanged (without calling `fn`) when `res.downloadLink`
 *     is falsy;
 *   - returns `undefined` when `fn` yields a falsy value;
 *   - otherwise stores the new link in `res.downloadLink` and returns `res`.
 */
export const requestRedirect = (fn) => (res) => {
  const currentLink = res.downloadLink;
  if (!currentLink) {
    return res;
  }
  const rewrittenLink = fn(currentLink, res);
  if (!rewrittenLink) {
    return undefined;
  }
  res.downloadLink = rewrittenLink;
  return res;
};
/**
 * Wraps a URL-rewriting function into a step that filters the redirected URL.
 *
 * @param {Function} fn - Invoked as `fn(res.redirectedUrl, res)`.
 * @returns {Function} A step `(res)` that always returns `res`. When
 *   `res.redirectedUrl` is truthy it is replaced by the result of `fn`,
 *   with any falsy result normalised to `undefined`; otherwise `fn` is not
 *   called and `res` is left untouched.
 */
export const redirectFilter = (fn) => (res) => {
  const { redirectedUrl } = res;
  if (!redirectedUrl) {
    return res;
  }
  const filtered = fn(redirectedUrl, res);
  res.redirectedUrl = filtered ? filtered : undefined;
  return res;
};
/**
 * Runs a resource's redirect target through the pipeline and copies the
 * resulting URL (and save path, if any) back onto the resource.
 *
 * Nothing happens when `res.redirectedUrl` is falsy or equal to `res.url`.
 *
 * @param {object} res - Resource being processed; mutated in place.
 * @param {*} submit - Not used by this function.
 * @param {*} options - Not used by this function.
 * @param {object} pipeline - Must provide `createAndProcessResource`, called
 *   as `(redirectedUrl, res.type, res.depth, null, res)`.
 * @returns {Promise<object>} Resolves to the same `res` object.
 */
export async function processRedirectedUrl(res, submit, options, pipeline) {
  const target = res.redirectedUrl;
  if (!target || target === res.url) {
    return res;
  }

  const resolved = await pipeline.createAndProcessResource(
    target,
    res.type,
    res.depth,
    null,
    res,
  );
  if (!resolved) {
    return res;
  }

  res.redirectedUrl = resolved.url;
  // https://github.com/website-local/website-scrap-engine/issues/385
  // 2011/11/15
  // Only record a save path when the processed redirect target has one.
  if (resolved.savePath) {
    res.redirectedSavePath = resolved.savePath;
  }
  return res;
}
/**
 * Decodes a resource body and parses it with cheerio's `load`.
 *
 * The encoding is the first truthy value among `res.encoding`,
 * `options.encoding[res.type]` and the literal `'utf8'`.
 *
 * @param {object} res - Resource providing `body`, `type` and optionally
 *   `encoding`.
 * @param {object} options - Provides the per-type `encoding` map and,
 *   optionally, `cheerioParse`, which is forwarded to `load` as its second
 *   argument when truthy.
 * @returns {*} Whatever `load` returns for the decoded body.
 */
export const parseHtml = (res, options) => {
  const charset = res.encoding || options.encoding[res.type] || 'utf8';
  const parserOptions = options.cheerioParse;
  const markup = toString(res.body, charset);
  // Call `load` with a single argument when no parser options are set,
  // rather than passing an explicit falsy second argument.
  return parserOptions ? load(markup, parserOptions) : load(markup);
};
/**
 * Wraps a synchronous document transformer into a resource-processing step.
 *
 * @param {Function} fn - Invoked as `fn(res.meta.doc, res)`; its return value
 *   becomes the new `res.meta.doc`.
 * @returns {Function} A step `(res, submit, options)` that always returns
 *   `res`. Resources whose `type` is not `ResourceType.Html` are passed
 *   through untouched. For HTML resources the document is parsed with
 *   `parseHtml(res, options)` only if `res.meta.doc` is not already set.
 */
export const processHtml = (fn) => (res, submit, options) => {
  if (res.type !== ResourceType.Html) {
    return res;
  }
  // Parse lazily so an already-attached document is reused.
  if (!res.meta.doc) {
    res.meta.doc = parseHtml(res, options);
  }
  res.meta.doc = fn(res.meta.doc, res);
  return res;
};
/**
 * Wraps an asynchronous document transformer into a resource-processing step.
 *
 * @param {Function} fn - Invoked as `fn(res.meta.doc, res)`; the awaited
 *   result becomes the new `res.meta.doc`.
 * @returns {Function} An async step `(res, submit, options)` resolving to
 *   `res`. Resources whose `type` is not `ResourceType.Html` are passed
 *   through untouched. For HTML resources the document is parsed with
 *   `parseHtml(res, options)` only if `res.meta.doc` is not already set.
 */
export const processHtmlAsync = (fn) => async (res, submit, options) => {
  if (res.type !== ResourceType.Html) {
    return res;
  }
  // Parse lazily so an already-attached document is reused.
  if (!res.meta.doc) {
    res.meta.doc = parseHtml(res, options);
  }
  res.meta.doc = await fn(res.meta.doc, res);
  return res;
};
//# sourceMappingURL=adapters.js.map