@crawlee/utils
A set of shared utilities that can be used by crawlers
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Sitemap = void 0;
exports.parseSitemap = parseSitemap;
const tslib_1 = require("tslib");
const node_crypto_1 = require("node:crypto");
const node_stream_1 = require("node:stream");
const node_string_decoder_1 = require("node:string_decoder");
const node_zlib_1 = require("node:zlib");
const sax_1 = tslib_1.__importDefault(require("sax"));
const whatwg_mimetype_1 = tslib_1.__importDefault(require("whatwg-mimetype"));
const log_1 = tslib_1.__importDefault(require("@apify/log"));
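/**
 * Transform stream that parses plain-text sitemaps: every non-empty line is emitted
 * as a `{ type: 'url', loc }` record.
 */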
class SitemapTxtParser extends node_stream_1.Transform {
constructor() {
super({
readableObjectMode: true,
transform: (chunk, _encoding, callback) => {
this.processBuffer(this.decoder.write(chunk), false);
callback();
},
flush: (callback) => {
this.processBuffer(this.decoder.end(), true);
callback();
},
});
Object.defineProperty(this, "decoder", {
enumerable: true,
configurable: true,
writable: true,
value: new node_string_decoder_1.StringDecoder('utf8')
});
Object.defineProperty(this, "buffer", {
enumerable: true,
configurable: true,
writable: true,
value: ''
});
}
processBuffer(input, finalize) {
this.buffer += input;
if (finalize || this.buffer.includes('\n')) {
const parts = this.buffer
.split('\n')
.map((part) => part.trim())
.filter((part) => part.length > 0);
if (finalize) {
for (const url of parts) {
this.push({ type: 'url', loc: url });
}
this.buffer = '';
}
else if (parts.length > 0) {
for (const url of parts.slice(0, -1)) {
this.push({ type: 'url', loc: url });
}
this.buffer = parts.at(-1);
}
}
}
}
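/**
 * Transform stream that parses XML sitemaps and sitemap index files with a streaming SAX parser.
 * Emits `{ type: 'url', ... }` records for `<urlset>` entries and `{ type: 'sitemapUrl', url }`
 * records for sitemaps referenced from a `<sitemapindex>`.
 */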
class SitemapXmlParser extends node_stream_1.Transform {
constructor() {
super({
readableObjectMode: true,
transform: (chunk, _encoding, callback) => {
this.parser.write(this.decoder.write(chunk));
callback();
},
flush: (callback) => {
const rest = this.decoder.end();
if (rest.length > 0) {
this.parser.write(rest);
}
this.parser.end();
callback();
},
});
Object.defineProperty(this, "decoder", {
enumerable: true,
configurable: true,
writable: true,
value: new node_string_decoder_1.StringDecoder('utf8')
});
Object.defineProperty(this, "parser", {
enumerable: true,
configurable: true,
writable: true,
value: new sax_1.default.SAXParser(true)
});
Object.defineProperty(this, "rootTagName", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "currentTag", {
enumerable: true,
configurable: true,
writable: true,
value: undefined
});
Object.defineProperty(this, "url", {
enumerable: true,
configurable: true,
writable: true,
value: {}
});
this.parser.onopentag = this.onOpenTag.bind(this);
this.parser.onclosetag = this.onCloseTag.bind(this);
this.parser.ontext = this.onText.bind(this);
this.parser.oncdata = this.onText.bind(this);
this.parser.onerror = this.destroy.bind(this);
}
onOpenTag(node) {
if (this.rootTagName !== undefined) {
if (node.name === 'loc' ||
node.name === 'lastmod' ||
node.name === 'priority' ||
node.name === 'changefreq') {
this.currentTag = node.name;
}
}
if (node.name === 'urlset') {
this.rootTagName = 'urlset';
}
if (node.name === 'sitemapindex') {
this.rootTagName = 'sitemapindex';
}
}
onCloseTag(name) {
if (name === 'loc' || name === 'lastmod' || name === 'priority' || name === 'changefreq') {
this.currentTag = undefined;
}
if (name === 'url' && this.url.loc !== undefined) {
this.push({ type: 'url', ...this.url, loc: this.url.loc });
this.url = {};
}
}
onText(text) {
if (this.currentTag === 'loc') {
if (this.rootTagName === 'sitemapindex') {
this.push({ type: 'sitemapUrl', url: text.trim() });
}
if (this.rootTagName === 'urlset') {
this.url ?? (this.url = {});
this.url.loc = text.trim();
}
}
text = text.trim();
if (this.currentTag === 'lastmod') {
this.url.lastmod = new Date(text);
}
if (this.currentTag === 'priority') {
this.url.priority = Number(text);
}
if (this.currentTag === 'changefreq') {
if (['always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'].includes(text)) {
this.url.changefreq = text;
}
}
}
}
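/**
 * Fetches and parses the given sitemap sources (URLs or raw XML content), following references
 * to nested sitemaps up to `maxDepth`, and yields the discovered URL records one by one.
 */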
async function* parseSitemap(initialSources, proxyUrl, options) {
const { gotScraping } = await import('got-scraping');
const { fileTypeStream } = await import('file-type');
const { emitNestedSitemaps = false, maxDepth = Infinity, sitemapRetries = 3, networkTimeouts, reportNetworkErrors = true, } = options ?? {};
const sources = [...initialSources];
const visitedSitemapUrls = new Set();
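    // Pick a parser based on the Content-Type header, falling back to the URL's file extension.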
const createParser = (contentType = '', url) => {
let mimeType;
try {
mimeType = new whatwg_mimetype_1.default(contentType);
}
catch {
mimeType = null;
}
if (mimeType?.isXML() || url?.pathname.endsWith('.xml')) {
return new SitemapXmlParser();
}
if (mimeType?.essence === 'text/plain' || url?.pathname.endsWith('.txt')) {
return new SitemapTxtParser();
}
throw new Error(`Unsupported sitemap content type (contentType = ${contentType}, url = ${url?.toString()})`);
};
while (sources.length > 0) {
const source = sources.shift();
if ((source?.depth ?? 0) > maxDepth) {
log_1.default.debug(`Skipping sitemap ${source.type === 'url' ? source.url : ''} because it reached max depth ${maxDepth}.`);
continue;
}
let items = null;
if (source.type === 'url') {
const sitemapUrl = new URL(source.url);
visitedSitemapUrls.add(sitemapUrl.toString());
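            // Attempt to download and parse the sitemap, retrying up to `sitemapRetries` times on failure.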
let retriesLeft = sitemapRetries + 1;
while (retriesLeft-- > 0) {
try {
const sitemapStream = await new Promise((resolve, reject) => {
const request = gotScraping.stream({
url: sitemapUrl,
proxyUrl,
method: 'GET',
timeout: networkTimeouts,
headers: {
accept: 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8',
},
});
request.on('response', () => resolve(request));
request.on('error', reject);
});
let error = null;
if (sitemapStream.response.statusCode >= 200 && sitemapStream.response.statusCode < 300) {
let contentType = sitemapStream.response.headers['content-type'];
const streamWithType = await fileTypeStream(sitemapStream);
if (streamWithType.fileType !== undefined) {
contentType = streamWithType.fileType.mime;
}
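                        // Gzipped sitemaps are decompressed below; the `.gz` suffix is stripped so that
                        // the parser can be chosen based on the remaining file extension.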
let isGzipped = false;
if (contentType !== undefined
? contentType === 'application/gzip'
: sitemapUrl.pathname.endsWith('.gz')) {
isGzipped = true;
if (sitemapUrl.pathname.endsWith('.gz')) {
sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3);
}
}
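                        // Stream the (possibly gzipped) response body through the selected parser;
                        // parser errors are captured in `error` and handled after the pipeline is set up.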
items = (0, node_stream_1.pipeline)(streamWithType, isGzipped ? (0, node_zlib_1.createGunzip)() : new node_stream_1.PassThrough(), createParser(contentType, sitemapUrl), (e) => {
if (e !== undefined && e !== null) {
error = { type: 'parser', error: e };
}
});
}
else {
error = {
type: 'fetch',
error: new Error(`Failed to fetch sitemap: ${sitemapUrl}, status code: ${sitemapStream.response.statusCode}`),
};
}
if (error !== null) {
const shouldIgnoreError = error.type === 'fetch' && !reportNetworkErrors;
if (!shouldIgnoreError) {
throw error.error;
}
}
else {
break;
}
}
catch (e) {
log_1.default.warning(`Malformed sitemap content: ${sitemapUrl}, ${retriesLeft === 0 ? 'no retries left.' : 'retrying...'} (${e})`);
}
}
}
else if (source.type === 'raw') {
items = (0, node_stream_1.pipeline)(node_stream_1.Readable.from([source.content]), createParser('text/xml'), (error) => {
if (error !== undefined) {
log_1.default.warning(`Malformed sitemap content: ${error}`);
}
});
}
if (items === null) {
continue;
}
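        // Queue any not-yet-visited nested sitemaps and yield the URL records together with their origin sitemap.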
for await (const item of items) {
if (item.type === 'sitemapUrl' && !visitedSitemapUrls.has(item.url)) {
sources.push({ type: 'url', url: item.url, depth: (source.depth ?? 0) + 1 });
if (emitNestedSitemaps) {
yield { loc: item.url, originSitemapUrl: null };
}
}
if (item.type === 'url') {
yield {
...item,
originSitemapUrl: source.type === 'url'
? source.url
: `raw://${(0, node_crypto_1.createHash)('sha256').update(source.content).digest('base64')}`,
};
}
}
}
}
/**
 * Loads one or more sitemaps from the given URLs, following references in sitemap index files, and exposes the contained URLs.
*
* **Example usage:**
* ```javascript
* // Load a sitemap
* const sitemap = await Sitemap.load(['https://example.com/sitemap.xml', 'https://example.com/sitemap_2.xml.gz']);
*
 * // Enqueue all the contained URLs (including those from sub-sitemaps referenced by sitemap index files)
* await crawler.addRequests(sitemap.urls);
* ```
*/
class Sitemap {
constructor(urls) {
Object.defineProperty(this, "urls", {
enumerable: true,
configurable: true,
writable: true,
value: urls
});
}
/**
     * Try to load the sitemap from the most common locations: `/sitemap.xml` and `/sitemap.txt`.
* For loading based on `Sitemap` entries in `robots.txt`, the {@link RobotsTxtFile} class should be used.
* @param url The domain URL to fetch the sitemap for.
* @param proxyUrl A proxy to be used for fetching the sitemap file.
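     * @example
     * A minimal usage sketch (the domain is hypothetical):
     * ```javascript
     * const sitemap = await Sitemap.tryCommonNames('https://example.com');
     * console.log(sitemap.urls);
     * ```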
*/
static async tryCommonNames(url, proxyUrl) {
const sitemapUrls = [];
const sitemapUrl = new URL(url);
sitemapUrl.search = '';
sitemapUrl.pathname = '/sitemap.xml';
sitemapUrls.push(sitemapUrl.toString());
sitemapUrl.pathname = '/sitemap.txt';
sitemapUrls.push(sitemapUrl.toString());
return Sitemap.load(sitemapUrls, proxyUrl, { reportNetworkErrors: false });
}
/**
     * Fetch sitemap content from the given URL or URLs and return the URLs of referenced pages.
     * @param urls Sitemap URL(s).
     * @param proxyUrl URL of a proxy to be used for fetching sitemap contents.
     * @param parseSitemapOptions Options passed on to {@link parseSitemap}.
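     * @example
     * A minimal usage sketch (the URL is hypothetical); `maxDepth: 1` limits how deep nested sitemaps are followed:
     * ```javascript
     * const sitemap = await Sitemap.load('https://example.com/sitemap_index.xml', undefined, { maxDepth: 1 });
     * console.log(sitemap.urls);
     * ```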
*/
static async load(urls, proxyUrl, parseSitemapOptions) {
return await this.parse((Array.isArray(urls) ? urls : [urls]).map((url) => ({ type: 'url', url })), proxyUrl, parseSitemapOptions);
}
/**
     * Parse XML sitemap content from a string and return the URLs of referenced pages. If the sitemap references other sitemaps, they will be loaded via HTTP.
* @param content XML sitemap content
* @param proxyUrl URL of a proxy to be used for fetching sitemap contents
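     * @example
     * A minimal usage sketch with inline XML content:
     * ```javascript
     * const sitemap = await Sitemap.fromXmlString(
     *     '<?xml version="1.0" encoding="UTF-8"?>' +
     *         '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' +
     *         '<url><loc>https://example.com/</loc></url>' +
     *         '</urlset>',
     * );
     * console.log(sitemap.urls); // ['https://example.com/']
     * ```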
*/
static async fromXmlString(content, proxyUrl) {
return await this.parse([{ type: 'raw', content }], proxyUrl);
}
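    /**
     * Collects page URLs from the given sitemap sources into a `Sitemap` instance.
     * Any failure while parsing results in an empty `Sitemap` being returned.
     */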
static async parse(sources, proxyUrl, parseSitemapOptions) {
const urls = [];
try {
for await (const item of parseSitemap(sources, proxyUrl, parseSitemapOptions)) {
urls.push(item.loc);
}
}
catch {
return new Sitemap([]);
}
return new Sitemap(urls);
}
}
exports.Sitemap = Sitemap;
//# sourceMappingURL=sitemap.js.map