UNPKG

@crawlee/core

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

515 lines • 20.3 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.SitemapRequestList = void 0;
const tslib_1 = require("tslib");
const node_stream_1 = require("node:stream");
const utils_1 = require("@crawlee/utils");
const minimatch_1 = require("minimatch");
const ow_1 = tslib_1.__importDefault(require("ow"));
const log_1 = tslib_1.__importDefault(require("@apify/log"));
const configuration_1 = require("../configuration");
const enqueue_links_1 = require("../enqueue_links");
const request_1 = require("../request");
const key_value_store_1 = require("./key_value_store");
const utils_2 = require("./utils");
/** Default key under which the list state is persisted in the key-value store. @internal */
const STATE_PERSISTENCE_KEY = 'SITEMAP_REQUEST_LIST_STATE';
/**
 * A list of URLs to crawl parsed from a sitemap.
 *
 * The loading of the sitemap is performed in the background so that crawling can start before the sitemap is fully loaded.
 */
class SitemapRequestList {
    /** @internal */
    constructor(options) {
        // NOTE: instance fields are declared via Object.defineProperty because this is
        // TypeScript-compiled output (class-field lowering); each call below is just a field initializer.
        /**
         * Set of URLs that were returned by `fetchNextRequest()` and not marked as handled yet.
         * @internal
         */
        Object.defineProperty(this, "inProgress", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: new Set()
        });
        /** Set of URLs for which `reclaimRequest()` was called. */
        Object.defineProperty(this, "reclaimed", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: new Set()
        });
        /**
         * Map of returned Request objects that have not been marked as handled yet.
         *
         * We use this to persist custom user fields on the in-progress (or reclaimed) requests.
         */
        Object.defineProperty(this, "requestData", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: new Map()
        });
        /**
         * Object for keeping track of the sitemap parsing progress.
         */
        Object.defineProperty(this, "sitemapParsingProgress", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: {
                /**
                 * URL of the sitemap that is currently being parsed. `null` if no sitemap is being parsed.
                 */
                inProgressSitemapUrl: null,
                /**
                 * Buffer for URLs from the currently parsed sitemap. Used for tracking partially loaded sitemaps across migrations.
                 */
                inProgressEntries: new Set(),
                /**
                 * Set of sitemap URLs that have not been parsed yet. If the set is empty and `inProgressSitemapUrl` is `null`, the sitemap loading is finished.
                 */
                pendingSitemapUrls: new Set(),
            }
        });
        /**
         * Object stream of URLs parsed from the sitemaps.
         * Using `highWaterMark`, this can manage the speed of the sitemap loading.
         *
         * Fetch the next URL to be processed using `fetchNextRequest()`.
         */
        Object.defineProperty(this, "urlQueueStream", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /**
         * Indicates whether the request list sitemap loading was aborted.
         *
         * If the loading was aborted before the sitemaps were fully loaded, the request list might be missing some URLs.
         * The `isSitemapFullyLoaded` method can be used to check if the sitemaps were fully loaded.
         *
         * If the loading is aborted and all the requests are handled, `isFinished()` will return `true`.
         */
        Object.defineProperty(this, "abortLoading", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: false
        });
        /** Number of URLs that were marked as handled */
        Object.defineProperty(this, "handledUrlCount", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: 0
        });
        // Key-value store key used by persistState()/restoreState(); undefined disables persistence.
        Object.defineProperty(this, "persistStateKey", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        // Lazily opened KeyValueStore instance (see persistState()/restoreState()).
        Object.defineProperty(this, "store", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        // Set to true by teardown(); makes pushNextUrl()/readNextUrl() no-ops.
        Object.defineProperty(this, "closed", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: false
        });
        /**
         * Proxy URL to be used for sitemap loading.
         */
        Object.defineProperty(this, "proxyUrl", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /**
         * Logger instance.
         */
        Object.defineProperty(this, "log", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: log_1.default.child({ prefix: 'SitemapRequestList' })
        });
        // Compiled glob/regexp matchers built from the `exclude` option; checked first in isUrlMatchingPatterns().
        Object.defineProperty(this, "urlExcludePatternObjects", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: []
        });
        // Compiled glob/regexp matchers built from the `globs`/`regexps` options; empty list means "match everything".
        Object.defineProperty(this, "urlPatternObjects", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: []
        });
        /** EventManager used to handle persistence */
        Object.defineProperty(this, "events", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "persistenceOptions", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        // Runtime validation of the options shape via `ow`. The `(0, fn)` pattern throughout this
        // file is compiler output that strips the `this` binding from namespaced calls.
        (0, ow_1.default)(options, ow_1.default.object.exactShape({
            sitemapUrls: ow_1.default.array.ofType(ow_1.default.string),
            proxyUrl: ow_1.default.optional.string,
            persistStateKey: ow_1.default.optional.string,
            signal: ow_1.default.optional.any(),
            timeoutMillis: ow_1.default.optional.number,
            maxBufferSize: ow_1.default.optional.number,
            parseSitemapOptions: ow_1.default.optional.object,
            globs: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.object.hasKeys('glob'))),
            exclude: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.regExp, ow_1.default.object.hasKeys('glob'), ow_1.default.object.hasKeys('regexp'))),
            regexps: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.regExp, ow_1.default.object.hasKeys('regexp'))),
            config: ow_1.default.optional.object,
            persistenceOptions: ow_1.default.optional.object,
        }));
        const { globs, exclude, regexps, config = configuration_1.Configuration.getGlobalConfig() } = options;
        // Each `exclude` entry may be a glob (string or { glob }) or a regexp (RegExp or { regexp });
        // normalize each one into a pattern object via the enqueue_links helpers.
        if (exclude?.length) {
            for (const excl of exclude) {
                if (typeof excl === 'string' || 'glob' in excl) {
                    this.urlExcludePatternObjects.push(...(0, enqueue_links_1.constructGlobObjectsFromGlobs)([excl]));
                }
                else if (excl instanceof RegExp || 'regexp' in excl) {
                    this.urlExcludePatternObjects.push(...(0, enqueue_links_1.constructRegExpObjectsFromRegExps)([excl]));
                }
            }
        }
        if (globs?.length) {
            this.urlPatternObjects.push(...(0, enqueue_links_1.constructGlobObjectsFromGlobs)(globs));
        }
        if (regexps?.length) {
            this.urlPatternObjects.push(...(0, enqueue_links_1.constructRegExpObjectsFromRegExps)(regexps));
        }
        this.persistStateKey = options.persistStateKey;
        this.persistenceOptions = { enable: true, ...options.persistenceOptions };
        this.proxyUrl = options.proxyUrl;
        // maxBufferSize becomes the stream's highWaterMark — it bounds how many parsed URLs
        // are buffered ahead of the crawler (default 200).
        this.urlQueueStream = this.createNewStream(options.maxBufferSize ?? 200);
        this.sitemapParsingProgress.pendingSitemapUrls = new Set(options.sitemapUrls);
        this.events = config.getEventManager();
        // Bind so the same function reference can be registered with events.on() in open()
        // and later removed with events.off() in teardown().
        this.persistState = this.persistState.bind(this);
    }
    /**
     * Creates a new object stream with the specified highWaterMark.
     * @param highWaterMark High water mark for the stream (the maximum number of objects the stream will buffer).
     * @returns A new object stream.
     */
    createNewStream(highWaterMark) {
        // The stream starts paused: URLs are pulled explicitly with read() in readNextUrl(),
        // never consumed via 'data' events.
        return new node_stream_1.Transform({
            objectMode: true,
            highWaterMark,
        }).pause();
    }
    /**
     * Returns a function that checks whether the provided pattern matches the closure URL.
     * @param url URL to be checked.
     * @returns A matcher function that checks whether the pattern matches the closure URL.
     */
    matchesUrl(url) {
        return (patternObject) => {
            const { regexp, glob } = patternObject;
            const matchesRegex = (regexp && url.match(regexp)) || false;
            const matchesGlob = (glob && (0, minimatch_1.minimatch)(url, glob, { nocase: true })) || false;
            return Boolean(matchesRegex || matchesGlob);
        };
    }
    /**
     * Checks whether the URL matches the `globs` / `regexps` / `exclude` provided in the `options`.
     * @param url URL to be checked.
     * @returns `true` if the URL matches the patterns, `false` otherwise.
     */
    isUrlMatchingPatterns(url) {
        // Exclusions win over inclusions; with no include patterns configured, every
        // non-excluded URL matches.
        return (!this.urlExcludePatternObjects.some(this.matchesUrl(url)) &&
            (this.urlPatternObjects.length === 0 || this.urlPatternObjects.some(this.matchesUrl(url))));
    }
    /**
     * Adds a URL to the queue of parsed URLs.
     *
     * Blocks if the stream is full until it is drained.
     */
    async pushNextUrl(url) {
        return new Promise((resolve) => {
            // Drop the URL silently if the list was torn down or the URL fails the pattern filter.
            if (this.closed || (url && !this.isUrlMatchingPatterns(url))) {
                resolve();
                return;
            }
            if (!this.urlQueueStream.push(url)) {
                // Back-pressure: wait until a consumer reads before resolving.
                // This doesn't work with the 'drain' event (it's not emitted for some reason).
                // 'readdata' is a custom event emitted by readNextUrl() and teardown().
                this.urlQueueStream.once('readdata', () => {
                    resolve();
                });
            }
            else {
                resolve();
            }
        });
    }
    /**
     * Reads the next URL from the queue of parsed URLs.
     *
     * If the stream is empty, blocks until a new URL is pushed.
     * @returns The next URL from the queue or `null` if we have read all URLs.
     */
    async readNextUrl() {
        return new Promise((resolve) => {
            if (this.closed) {
                resolve(null);
                return;
            }
            const result = this.urlQueueStream.read();
            if (!result && !this.isSitemapFullyLoaded()) {
                // Queue is momentarily empty but the background load may still produce URLs —
                // wait for the next one. If the sitemap IS fully loaded, fall through and
                // resolve with whatever read() returned (possibly null = end of list).
                this.urlQueueStream.once('readable', () => {
                    const nextUrl = this.urlQueueStream.read();
                    resolve(nextUrl);
                });
            }
            else {
                resolve(result);
            }
            // Signal any pushNextUrl() call blocked on back-pressure that space was freed.
            this.urlQueueStream.emit('readdata');
        });
    }
    /**
     * Indicates whether the background processing of sitemap contents has successfully finished.
     *
     * If this is `false`, the background processing is either still in progress or was aborted.
     */
    isSitemapFullyLoaded() {
        return (this.sitemapParsingProgress.inProgressSitemapUrl === null &&
            this.sitemapParsingProgress.pendingSitemapUrls.size === 0);
    }
    /**
     * Start processing the sitemaps and loading the URLs.
     *
     * Resolves once all the sitemaps URLs have been fully loaded (sets `isSitemapFullyLoaded` to `true`).
     */
    async load({ parseSitemapOptions, }) {
        while (!this.isSitemapFullyLoaded() && !this.abortLoading) {
            // Resume a sitemap that was mid-parse before a state restore, otherwise take the
            // next pending one.
            const sitemapUrl = this.sitemapParsingProgress.inProgressSitemapUrl ??
                this.sitemapParsingProgress.pendingSitemapUrls.values().next().value;
            try {
                // maxDepth: 0 + emitNestedSitemaps: true makes the parser report nested sitemap
                // URLs as items instead of recursing, so this loop can track them in
                // pendingSitemapUrls and persist progress between sitemaps.
                for await (const item of (0, utils_1.parseSitemap)([{ type: 'url', url: sitemapUrl }], this.proxyUrl, {
                    ...parseSitemapOptions,
                    maxDepth: 0,
                    emitNestedSitemaps: true,
                })) {
                    if (!item.originSitemapUrl) {
                        // This is a nested sitemap
                        this.sitemapParsingProgress.pendingSitemapUrls.add(item.loc);
                        continue;
                    }
                    // inProgressEntries lets a restored run skip URLs already pushed from a
                    // partially parsed sitemap.
                    if (!this.sitemapParsingProgress.inProgressEntries.has(item.loc)) {
                        await this.pushNextUrl(item.loc);
                        this.sitemapParsingProgress.inProgressEntries.add(item.loc);
                    }
                }
            }
            catch (e) {
                // Best-effort: log and move on to the next sitemap rather than failing the whole load.
                this.log.error('Error loading sitemap contents:', e);
            }
            this.sitemapParsingProgress.pendingSitemapUrls.delete(sitemapUrl);
            this.sitemapParsingProgress.inProgressEntries.clear();
            this.sitemapParsingProgress.inProgressSitemapUrl = null;
        }
        // Ending the stream makes readNextUrl() eventually return null once drained.
        this.urlQueueStream.end();
    }
    /**
     * Open a sitemap and start processing it.
     *
     * Resolves to a new instance of `SitemapRequestList`, which **might not be fully loaded yet** - i.e. the sitemap might still be loading in the background.
     *
     * Track the loading progress using the `isSitemapFullyLoaded` property.
     */
    static async open(options) {
        const requestList = new SitemapRequestList({
            ...options,
            persistStateKey: options.persistStateKey ?? STATE_PERSISTENCE_KEY,
        });
        await requestList.restoreState();
        // Deliberately not awaited — loading continues in the background while crawling starts.
        void requestList.load({ parseSitemapOptions: options.parseSitemapOptions });
        if (requestList.persistenceOptions.enable) {
            requestList.events.on("persistState" /* EventType.PERSIST_STATE */, requestList.persistState);
        }
        options?.signal?.addEventListener('abort', () => {
            requestList.abortLoading = true;
        });
        // NOTE(review): this timer is never cleared nor unref()'d, so it may keep the
        // process alive up to timeoutMillis even after loading completes — confirm intended.
        if (options.timeoutMillis) {
            setTimeout(() => {
                requestList.abortLoading = true;
            }, options.timeoutMillis);
        }
        return requestList;
    }
    /**
     * @inheritDoc
     */
    length() {
        return this.urlQueueStream.readableLength + this.handledUrlCount - this.inProgress.size - this.reclaimed.size;
    }
    /**
     * @inheritDoc
     */
    async isFinished() {
        return ((await this.isEmpty()) &&
            this.inProgress.size === 0 &&
            (this.isSitemapFullyLoaded() || this.abortLoading));
    }
    /**
     * @inheritDoc
     */
    async isEmpty() {
        return this.reclaimed.size === 0 && this.urlQueueStream.readableLength === 0;
    }
    /**
     * @inheritDoc
     */
    handledCount() {
        return this.handledUrlCount;
    }
    /**
     * @inheritDoc
     */
    async persistState() {
        if (this.persistStateKey === undefined) {
            return;
        }
        // Lazily open the default key-value store on first use.
        this.store ?? (this.store = await key_value_store_1.KeyValueStore.open());
        // Drain every buffered URL so it can be serialized.
        const urlQueue = [];
        while (this.urlQueueStream.readableLength > 0) {
            const url = this.urlQueueStream.read();
            if (url === null) {
                break;
            }
            urlQueue.push(url);
        }
        // Create a new stream, as we have read all the URLs from the current one.
        // Pushing the urls back to the original stream might not be possible if it has been ended.
        const newStream = this.createNewStream(this.urlQueueStream.readableHighWaterMark);
        for (const url of urlQueue) {
            newStream.push(url);
        }
        if (this.urlQueueStream.writableEnded) {
            newStream.end();
        }
        this.urlQueueStream = newStream;
        await this.store.setValue(this.persistStateKey, {
            sitemapParsingProgress: {
                pendingSitemapUrls: Array.from(this.sitemapParsingProgress.pendingSitemapUrls),
                inProgressSitemapUrl: this.sitemapParsingProgress.inProgressSitemapUrl,
                inProgressEntries: Array.from(this.sitemapParsingProgress.inProgressEntries),
            },
            urlQueue,
            reclaimed: [...this.inProgress, ...this.reclaimed], // In-progress and reclaimed requests will be both retried if state is restored
            requestData: Array.from(this.requestData.entries()),
            abortLoading: this.abortLoading,
            closed: this.closed,
        });
    }
    // Restores list state persisted by persistState(); a no-op when persistence is
    // disabled or no saved state exists yet.
    async restoreState() {
        await (0, utils_2.purgeDefaultStorages)({ onlyPurgeOnce: true });
        if (this.persistStateKey === undefined) {
            return;
        }
        this.store ?? (this.store = await key_value_store_1.KeyValueStore.open());
        const state = await this.store.getValue(this.persistStateKey);
        if (state === null) {
            return;
        }
        this.reclaimed = new Set(state.reclaimed);
        this.sitemapParsingProgress = {
            pendingSitemapUrls: new Set(state.sitemapParsingProgress.pendingSitemapUrls),
            inProgressSitemapUrl: state.sitemapParsingProgress.inProgressSitemapUrl,
            inProgressEntries: new Set(state.sitemapParsingProgress.inProgressEntries),
        };
        // `?? []` keeps compatibility with states saved before requestData existed.
        this.requestData = new Map(state.requestData ?? []);
        for (const url of state.urlQueue) {
            this.urlQueueStream.push(url);
        }
        this.abortLoading = state.abortLoading;
        this.closed = state.closed;
    }
    /**
     * @inheritDoc
     */
    async fetchNextRequest() {
        // Try to return a reclaimed request first
        let nextUrl = this.reclaimed.values().next().value;
        if (nextUrl) {
            this.reclaimed.delete(nextUrl);
        }
        else {
            // Otherwise read next url from the stream
            nextUrl = await this.readNextUrl();
            if (!nextUrl) {
                return null;
            }
            this.requestData.set(nextUrl, new request_1.Request({ url: nextUrl }));
        }
        this.inProgress.add(nextUrl);
        // For reclaimed URLs this returns the stored Request, preserving any custom user fields.
        return this.requestData.get(nextUrl);
    }
    /**
     * @inheritDoc
     */
    async *[Symbol.asyncIterator]() {
        while (!(await this.isFinished())) {
            const request = await this.fetchNextRequest();
            if (!request)
                break;
            yield request;
        }
    }
    /**
     * @inheritDoc
     */
    async reclaimRequest(request) {
        this.ensureInProgressAndNotReclaimed(request.url);
        this.reclaimed.add(request.url);
        this.inProgress.delete(request.url);
    }
    /**
     * Aborts the internal sitemap loading, stops the processing of the sitemap contents and drops all the pending URLs.
     *
     * Calling `fetchNextRequest()` after this method will always return `null`.
     */
    async teardown() {
        this.closed = true;
        this.abortLoading = true;
        this.events.off("persistState" /* EventType.PERSIST_STATE */, this.persistState);
        await this.persistState();
        this.urlQueueStream.emit('readdata'); // unblocks the potentially waiting `pushNextUrl` call
    }
    /**
     * @inheritDoc
     */
    async markRequestHandled(request) {
        this.handledUrlCount += 1;
        this.ensureInProgressAndNotReclaimed(request.url);
        this.inProgress.delete(request.url);
        this.requestData.delete(request.url);
    }
    // Guards reclaimRequest()/markRequestHandled() against URLs that were never fetched
    // or were already reclaimed.
    ensureInProgressAndNotReclaimed(url) {
        if (!this.inProgress.has(url)) {
            throw new Error(`The request is not being processed (url: ${url})`);
        }
        if (this.reclaimed.has(url)) {
            throw new Error(`The request was already reclaimed (url: ${url})`);
        }
    }
}
exports.SitemapRequestList = SitemapRequestList;
//# sourceMappingURL=sitemap_request_list.js.map