// website-scrap-engine
// Version: (not captured in this listing)
// Configurable website scraper in TypeScript
// 77 lines • 3.16 kB
// JavaScript
import { ResourceType } from '../resource.js';
import { error, skip } from '../logger/logger.js';
import { parseHtml } from './adapters.js';
import { getResourceBodyFromHtml } from './save-html-to-disk.js';
// Attribute selectors used to locate link-carrying elements in an SVG document.
// Each [selector, attr] pair expands to a { selector, attr, type } record, and
// every record uses ResourceType.Binary as its type.
const svgSelectors = [
  ['*[xlink\\:href]', 'xlink:href'],
  ['*[href]', 'href'],
].map(([selector, attr]) => ({ selector, attr, type: ResourceType.Binary }));
/**
 * Rewrite the link attributes of an SVG resource and queue the linked resources.
 *
 * For each element matched by an entry of `svgSelectors`, the attribute value
 * is passed through `pipeline.linkRedirect`, `pipeline.detectResourceType`,
 * `pipeline.createResource` and `pipeline.processBeforeDownload`, in that
 * order. If any of `linkRedirect`, `detectResourceType` or
 * `processBeforeDownload` yields a falsy value, the element is left untouched
 * (and a trace line is written when trace logging is enabled). Otherwise the
 * resource is handed to `submit` unless it is flagged
 * `shouldBeDiscardedFromDownload`, and the attribute is overwritten with the
 * resource's `replacePath`.
 *
 * Links are processed sequentially, one `await` at a time, in selector order
 * and then element order.
 *
 * @param {object} res - Resource to process. Returned unchanged when
 *   `res.type` is not `ResourceType.Svg`. `res.meta.doc` is reused when
 *   present; otherwise it is produced by `parseHtml` and cached there.
 * @param {Function} submit - Called with every resource that should be downloaded.
 * @param {object} options - Forwarded to `parseHtml`,
 *   `pipeline.processBeforeDownload` and `getResourceBodyFromHtml`;
 *   `options.encoding[linkType]` is forwarded to `pipeline.createResource`.
 * @param {object} pipeline - Provides `linkRedirect`, `detectResourceType`,
 *   `createResource` and `processBeforeDownload`.
 * @returns {Promise<object>} The same `res` object; for SVG resources
 *   `res.body` has been replaced by the result of `getResourceBodyFromHtml`.
 */
export async function processSvg(res, submit, options, pipeline) {
  if (res.type !== ResourceType.Svg) {
    return res;
  }

  // NOTE: refUrl is intentionally not passed through pipeline.linkRedirect;
  // the original author noted that doing so is useless because
  // processRedirectedUrl is enabled by default.
  const refUrl = res.redirectedUrl || res.url;
  // The resource's own save path is only forwarded when the reference URL is
  // still the original URL (i.e. no differing redirectedUrl is set).
  const savePath = refUrl === res.url ? res.savePath : undefined;
  const depth = res.depth + 1;

  let doc = res.meta.doc;
  if (!doc) {
    res.meta.doc = doc = parseHtml(res, options);
  }

  for (const { selector, attr, type } of svgSelectors) {
    // A selector entry without an attribute name has nothing to read or rewrite.
    if (!attr) {
      continue;
    }
    const elements = doc(selector);
    for (let index = 0; index < elements.length; index++) {
      const elem = elements.eq(index);
      const originalLink = elem.attr(attr);
      // skip missing or empty links
      if (!originalLink) {
        continue;
      }

      const link = await pipeline.linkRedirect(originalLink, elem, res);
      if (!link) {
        if (skip.isTraceEnabled()) {
          skip.trace('skip linkRedirect', originalLink, refUrl);
        }
        continue;
      }

      const linkType = await pipeline.detectResourceType(link, type, elem, res);
      if (!linkType) {
        if (skip.isTraceEnabled()) {
          skip.trace('skip detectResourceType', originalLink, link, refUrl);
        }
        continue;
      }

      const created = await pipeline.createResource(
        linkType,
        depth,
        link,
        refUrl,
        res.localRoot,
        options.encoding[linkType],
        savePath,
        res.type,
      );
      const resource = await pipeline.processBeforeDownload(created, elem, res, options);
      if (!resource) {
        if (skip.isTraceEnabled()) {
          skip.trace('skip processBeforeDownload', originalLink, link, linkType, refUrl);
        }
        continue;
      }

      if (!resource.shouldBeDiscardedFromDownload) {
        submit(resource);
      }

      // historical workaround: a bare '.html' replacement becomes an empty link
      const { replacePath } = resource;
      const replaceValue =
        replacePath === '.html' || replacePath === '/.html' ? '' : replacePath;
      elem.attr(attr, replaceValue);
    }
  }

  res.body = getResourceBodyFromHtml(res, options);
  return res;
}
//# sourceMappingURL=process-svg.js.map