website-scrap-engine
Version:
Configurable website scraper in typescript
34 lines • 1.17 kB
JavaScript
import { load } from 'cheerio';
import { ResourceType } from '../resource.js';
import { toString } from '../util.js';
/**
 * Expands a sitemap resource into the pages it lists.
 *
 * Resources whose type is not `ResourceType.SiteMap` are returned untouched.
 * Otherwise the body is decoded, every `urlset url loc` entry is collected
 * (trimmed, blanks dropped, duplicates removed, first-seen order kept), and a
 * resource of type `ResourceType.Html` is created for each one through the
 * pipeline, one at a time. Resources that come back falsy or flagged
 * `shouldBeDiscardedFromDownload` are left out; the rest are handed to
 * `submit` in a single call (which also happens when the list is empty).
 *
 * @param res - Resource being processed; `type`, `body`, `encoding` and
 *   `depth` are read from it.
 * @param submit - Called once (and awaited) with the array of accepted resources.
 * @param options - Only `options.encoding[ResourceType.SiteMap]` is read, and
 *   only when `res.encoding` is falsy.
 * @param pipeline - Must provide `createAndProcessResource(url, type, depth, element, parent)`.
 * @returns The same `res` object that was passed in.
 */
export async function processSiteMap(res, submit, options, pipeline) {
  // Anything that is not a sitemap passes straight through.
  if (res.type !== ResourceType.SiteMap) {
    return res;
  }

  // Encoding precedence: the resource's own, then the per-type option, then utf8.
  const encoding = res.encoding || options.encoding[ResourceType.SiteMap] || 'utf8';
  const $ = load(toString(res.body, encoding));

  // A Set removes repeated <loc> entries while preserving document order.
  const locations = new Set();
  // noinspection CssInvalidHtmlTagReference
  $('urlset url loc').each((_, element) => {
    const rawText = $(element).text();
    const location = rawText ? rawText.trim() : '';
    if (location) {
      locations.add(location);
    }
  });

  // Pages listed by the sitemap sit one level below the sitemap itself.
  const childDepth = res.depth + 1;
  const accepted = [];
  // Deliberately sequential: each resource is fully processed before the next starts.
  for (const location of locations) {
    const child = await pipeline.createAndProcessResource(location, ResourceType.Html, childDepth, null, res);
    if (child && !child.shouldBeDiscardedFromDownload) {
      accepted.push(child);
    }
  }

  await submit(accepted);
  return res;
}
//# sourceMappingURL=process-site-map.js.map