website-scrap-engine
Version:
Configurable website scraper in typescript
64 lines • 2.58 kB
JavaScript
import { ResourceType } from '../resource.js';
import { parseHtml } from './adapters.js';
import { skip } from '../logger/logger.js';
/**
 * Pattern for the value of a `<meta http-equiv="refresh">` `content` attribute.
 *
 * Originally created by https://github.com/stevenvachon at
 * https://github.com/stevenvachon/http-equiv-refresh
 * MIT license
 *
 * The pattern is anchored at both ends and case-insensitive. Capture groups:
 * - 1: the leading run of digits
 * - 2: the target that follows `;` (and an optional `url=`) when it is wrapped
 *      in single or double quotes, without the surrounding whitespace
 * - 3: the same target when it is not quoted
 * Groups 2 and 3 are mutually exclusive; both are unset when the value
 * contains only the leading digits.
 */
const META_REFRESH_PATTERN = /^\s*(\d+)(?:\s*;(?:\s*url\s*=)?\s*(?:["']\s*(.*?)\s*['"]|(.*?)))?\s*$/i;
/**
 * Swaps the refresh target inside a meta-refresh `content` value for a new one.
 *
 * A plain `content.replace(link, replacement)` is not safe here: it rewrites the
 * first textual occurrence of the link (which may be the delay, e.g. in
 * `1;url=1`) and it expands `$`-sequences found in the replacement.
 *
 * @param {string} content - the original `content` attribute value
 * @param {RegExpExecArray} match - result of META_REFRESH_PATTERN.exec(content),
 *   with a non-empty group 2 (quoted target) or group 3 (unquoted target)
 * @param {string} replacement - text to put in place of the target, verbatim
 * @returns {string} the attribute value with only the target replaced
 */
function replaceMetaRefreshLink(content, match, replacement) {
    const link = match[2] || match[3];
    // The pattern is anchored at both ends and the target is followed only by
    // optional whitespace (plus one closing quote and more optional whitespace
    // in the quoted form), so its end offset is found by stripping that tail.
    let end = content.trimEnd().length;
    if (match[2]) {
        // Quoted form: step back over the closing quote and the whitespace before it.
        end = content.slice(0, end - 1).trimEnd().length;
    }
    const start = end - link.length;
    return `${content.slice(0, start)}${replacement}${content.slice(end)}`;
}

/**
 * Finds `<meta http-equiv="refresh" content="...">` targets in an HTML resource,
 * creates a resource for each of them through the pipeline, submits it for
 * download and rewrites the `content` attribute to point at the resource's
 * `replacePath`.
 *
 * Resources whose type is not `ResourceType.Html` are returned untouched.
 * The parsed document is cached on `res.meta.doc` when it was not there yet.
 *
 * @param res - the resource being processed; its `type`, `meta`, `url`,
 *   `redirectedUrl`, `savePath`, `depth` and `localRoot` are read
 * @param submit - called with every created resource that is not flagged
 *   `shouldBeDiscardedFromDownload`
 * @param options - passed to `parseHtml` and `pipeline.processBeforeDownload`;
 *   `options.encoding[linkType]` is passed to `pipeline.createResource`
 * @param pipeline - provides `linkRedirect`, `detectResourceType`,
 *   `createResource` and `processBeforeDownload`
 * @returns the same `res` object
 */
export async function processHtmlMetaRefresh(res, submit, options, pipeline) {
    if (res.type !== ResourceType.Html) {
        return res;
    }
    if (!res.meta.doc) {
        res.meta.doc = parseHtml(res, options);
    }
    const $ = res.meta.doc;
    const metaLinks = $('meta[http-equiv="refresh"][content]');
    if (!metaLinks.length) {
        return res;
    }
    // Links are resolved against the final url when the page was redirected.
    const refUrl = res.redirectedUrl || res.url;
    // The save path of this page is only passed on when no redirect happened.
    const savePath = refUrl === res.url ? res.savePath : undefined;
    const depth = res.depth + 1;
    for (let index = 0; index < metaLinks.length; index++) {
        const elem = metaLinks.eq(index);
        const attrValue = elem.attr('content');
        if (!attrValue) {
            continue;
        }
        const match = META_REFRESH_PATTERN.exec(attrValue);
        if (!match) {
            continue;
        }
        // Group 2 is the quoted target, group 3 the unquoted one.
        const originalLink = match[2] || match[3];
        if (!originalLink) {
            continue;
        }
        const link = await pipeline.linkRedirect(originalLink, elem, res);
        if (!link) {
            continue;
        }
        const linkType = await pipeline.detectResourceType(link, ResourceType.Html, elem, res);
        if (!linkType) {
            if (skip.isTraceEnabled()) {
                skip.trace('skip detectResourceType', originalLink, link, refUrl);
            }
            continue;
        }
        let resource = await pipeline.createResource(linkType, depth, link, refUrl, res.localRoot, options.encoding[linkType], savePath, res.type);
        resource = await pipeline.processBeforeDownload(resource, elem, res, options);
        if (!resource) {
            if (skip.isTraceEnabled()) {
                skip.trace('skip processBeforeDownload', originalLink, link, linkType, refUrl);
            }
            continue;
        }
        if (!resource.shouldBeDiscardedFromDownload) {
            submit(resource);
        }
        // Replace the target at its exact position, inserting the path verbatim.
        elem.attr('content', replaceMetaRefreshLink(attrValue, match, resource.replacePath));
    }
    return res;
}
//# sourceMappingURL=process-html-meta.js.map