UNPKG

website-scrap-engine

Version:
34 lines 1.17 kB
import { load } from 'cheerio';
import { ResourceType } from '../resource.js';
import { toString } from '../util.js';

/**
 * Pipeline step for sitemap resources.
 *
 * Resources whose type is not `ResourceType.SiteMap` are returned untouched.
 * For a sitemap, the body is decoded and parsed with cheerio, the trimmed text
 * of every element matching `urlset url loc` is collected (duplicates and
 * blank entries dropped, first-seen order kept), and each collected URL is
 * passed to `pipeline.createAndProcessResource` as an Html resource one level
 * deeper than the sitemap, with the sitemap as the referring resource.
 * Resources that come back falsy or flagged `shouldBeDiscardedFromDownload`
 * are left out; the remainder is handed to `submit` in a single call.
 *
 * @param {object} res - Resource being processed; `type`, `body`, `encoding`
 *   and `depth` are read.
 * @param {Function} submit - Called once (and awaited) with the array of
 *   resources created from the sitemap entries; the array may be empty.
 * @param {object} options - `options.encoding[ResourceType.SiteMap]` is the
 *   fallback encoding when `res.encoding` is falsy.
 * @param {object} pipeline - Must provide `createAndProcessResource`.
 * @returns {Promise<object>} The same `res` that was passed in.
 */
export async function processSiteMap(res, submit, options, pipeline) {
  if (res.type !== ResourceType.SiteMap) {
    return res;
  }

  // Encoding precedence: the resource's own, then the per-type option, then utf8.
  const encoding =
    res.encoding || options.encoding[ResourceType.SiteMap] || 'utf8';
  const $ = load(toString(res.body, encoding));
  const childDepth = res.depth + 1;

  // A Set both de-duplicates and remembers first-seen order.
  const locations = new Set();
  // noinspection CssInvalidHtmlTagReference
  $('urlset url loc').each((_, locNode) => {
    const rawText = $(locNode).text();
    const location = rawText ? rawText.trim() : rawText;
    if (location) {
      locations.add(location);
    }
  });

  const accepted = [];
  // Awaited one at a time on purpose: entries are processed strictly in order.
  // noinspection DuplicatedCode
  for (const location of locations) {
    const child = await pipeline.createAndProcessResource(
      location,
      ResourceType.Html,
      childDepth,
      null,
      res
    );
    if (child && !child.shouldBeDiscardedFromDownload) {
      accepted.push(child);
    }
  }

  await submit(accepted);
  return res;
}
//# sourceMappingURL=process-site-map.js.map