// @crawlee/core
// The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of
// data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.SitemapRequestList = void 0;
const tslib_1 = require("tslib");
const node_stream_1 = require("node:stream");
const utils_1 = require("@crawlee/utils");
const minimatch_1 = require("minimatch");
const ow_1 = tslib_1.__importDefault(require("ow"));
const log_1 = tslib_1.__importDefault(require("@apify/log"));
const configuration_1 = require("../configuration");
const enqueue_links_1 = require("../enqueue_links");
const request_1 = require("../request");
const key_value_store_1 = require("./key_value_store");
const utils_2 = require("./utils");
/** @internal */
const STATE_PERSISTENCE_KEY = 'SITEMAP_REQUEST_LIST_STATE';
/**
* A list of URLs to crawl parsed from a sitemap.
*
* The loading of the sitemap is performed in the background so that crawling can start before the sitemap is fully loaded.
*/
class SitemapRequestList {
/** @internal */
constructor(options) {
/**
* Set of URLs that were returned by `fetchNextRequest()` and not marked as handled yet.
* @internal
*/
Object.defineProperty(this, "inProgress", {
enumerable: true,
configurable: true,
writable: true,
value: new Set()
});
/** Set of URLs for which `reclaimRequest()` was called. */
Object.defineProperty(this, "reclaimed", {
enumerable: true,
configurable: true,
writable: true,
value: new Set()
});
/**
* Map of returned Request objects that have not been marked as handled yet.
*
* We use this to persist custom user fields on the in-progress (or reclaimed) requests.
*/
Object.defineProperty(this, "requestData", {
enumerable: true,
configurable: true,
writable: true,
value: new Map()
});
/**
* Object for keeping track of the sitemap parsing progress.
*/
Object.defineProperty(this, "sitemapParsingProgress", {
enumerable: true,
configurable: true,
writable: true,
value: {
/**
* URL of the sitemap that is currently being parsed. `null` if no sitemap is being parsed.
*/
inProgressSitemapUrl: null,
/**
* Buffer for URLs from the currently parsed sitemap. Used for tracking partially loaded sitemaps across migrations.
*/
inProgressEntries: new Set(),
/**
* Set of sitemap URLs that have not been parsed yet. If the set is empty and `inProgressSitemapUrl` is `null`, the sitemap loading is finished.
*/
pendingSitemapUrls: new Set(),
}
});
/**
* Object stream of URLs parsed from the sitemaps.
* Using `highWaterMark`, this can manage the speed of the sitemap loading.
*
* Fetch the next URL to be processed using `fetchNextRequest()`.
*/
Object.defineProperty(this, "urlQueueStream", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
/**
* Indicates whether the request list sitemap loading was aborted.
*
* If the loading was aborted before the sitemaps were fully loaded, the request list might be missing some URLs.
* The `isSitemapFullyLoaded` method can be used to check if the sitemaps were fully loaded.
*
* If the loading is aborted and all the requests are handled, `isFinished()` will return `true`.
*/
Object.defineProperty(this, "abortLoading", {
enumerable: true,
configurable: true,
writable: true,
value: false
});
/** Number of URLs that were marked as handled */
Object.defineProperty(this, "handledUrlCount", {
enumerable: true,
configurable: true,
writable: true,
value: 0
});
Object.defineProperty(this, "persistStateKey", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "store", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "closed", {
enumerable: true,
configurable: true,
writable: true,
value: false
});
/**
* Proxy URL to be used for sitemap loading.
*/
Object.defineProperty(this, "proxyUrl", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
/**
* Logger instance.
*/
Object.defineProperty(this, "log", {
enumerable: true,
configurable: true,
writable: true,
value: log_1.default.child({ prefix: 'SitemapRequestList' })
});
Object.defineProperty(this, "urlExcludePatternObjects", {
enumerable: true,
configurable: true,
writable: true,
value: []
});
Object.defineProperty(this, "urlPatternObjects", {
enumerable: true,
configurable: true,
writable: true,
value: []
});
/** EventManager used to handle persistence */
Object.defineProperty(this, "events", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "persistenceOptions", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
(0, ow_1.default)(options, ow_1.default.object.exactShape({
sitemapUrls: ow_1.default.array.ofType(ow_1.default.string),
proxyUrl: ow_1.default.optional.string,
persistStateKey: ow_1.default.optional.string,
signal: ow_1.default.optional.any(),
timeoutMillis: ow_1.default.optional.number,
maxBufferSize: ow_1.default.optional.number,
parseSitemapOptions: ow_1.default.optional.object,
globs: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.object.hasKeys('glob'))),
exclude: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.regExp, ow_1.default.object.hasKeys('glob'), ow_1.default.object.hasKeys('regexp'))),
regexps: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.regExp, ow_1.default.object.hasKeys('regexp'))),
config: ow_1.default.optional.object,
persistenceOptions: ow_1.default.optional.object,
}));
const { globs, exclude, regexps, config = configuration_1.Configuration.getGlobalConfig() } = options;
if (exclude?.length) {
for (const excl of exclude) {
if (typeof excl === 'string' || 'glob' in excl) {
this.urlExcludePatternObjects.push(...(0, enqueue_links_1.constructGlobObjectsFromGlobs)([excl]));
}
else if (excl instanceof RegExp || 'regexp' in excl) {
this.urlExcludePatternObjects.push(...(0, enqueue_links_1.constructRegExpObjectsFromRegExps)([excl]));
}
}
}
if (globs?.length) {
this.urlPatternObjects.push(...(0, enqueue_links_1.constructGlobObjectsFromGlobs)(globs));
}
if (regexps?.length) {
this.urlPatternObjects.push(...(0, enqueue_links_1.constructRegExpObjectsFromRegExps)(regexps));
}
this.persistStateKey = options.persistStateKey;
this.persistenceOptions = { enable: true, ...options.persistenceOptions };
this.proxyUrl = options.proxyUrl;
this.urlQueueStream = this.createNewStream(options.maxBufferSize ?? 200);
this.sitemapParsingProgress.pendingSitemapUrls = new Set(options.sitemapUrls);
this.events = config.getEventManager();
this.persistState = this.persistState.bind(this);
}
/**
* Creates a new object stream with the specified highWaterMark.
* @param highWaterMark High water mark for the stream (the maximum number of objects the stream will buffer).
* @returns A new object stream.
*/
createNewStream(highWaterMark) {
return new node_stream_1.Transform({
objectMode: true,
highWaterMark,
}).pause();
}
/**
* Returns a function that checks whether the provided pattern matches the closure URL.
* @param url URL to be checked.
* @returns A matcher function that checks whether the pattern matches the closure URL.
*/
matchesUrl(url) {
return (patternObject) => {
const { regexp, glob } = patternObject;
const matchesRegex = (regexp && url.match(regexp)) || false;
const matchesGlob = (glob && (0, minimatch_1.minimatch)(url, glob, { nocase: true })) || false;
return Boolean(matchesRegex || matchesGlob);
};
}
/**
* Checks whether the URL matches the `globs` / `regexps` / `exclude` provided in the `options`.
* @param url URL to be checked.
* @returns `true` if the URL matches the patterns, `false` otherwise.
*/
isUrlMatchingPatterns(url) {
return (!this.urlExcludePatternObjects.some(this.matchesUrl(url)) &&
(this.urlPatternObjects.length === 0 || this.urlPatternObjects.some(this.matchesUrl(url))));
}
/**
* Adds a URL to the queue of parsed URLs.
*
* Blocks if the stream is full until it is drained.
*/
async pushNextUrl(url) {
return new Promise((resolve) => {
if (this.closed || (url && !this.isUrlMatchingPatterns(url))) {
resolve();
return;
}
if (!this.urlQueueStream.push(url)) {
// This doesn't work with the 'drain' event (it's not emitted for some reason).
this.urlQueueStream.once('readdata', () => {
resolve();
});
}
else {
resolve();
}
});
}
/**
* Reads the next URL from the queue of parsed URLs.
*
* If the stream is empty, blocks until a new URL is pushed.
* @returns The next URL from the queue or `null` if we have read all URLs.
*/
async readNextUrl() {
return new Promise((resolve) => {
if (this.closed) {
resolve(null);
return;
}
const result = this.urlQueueStream.read();
if (!result && !this.isSitemapFullyLoaded()) {
this.urlQueueStream.once('readable', () => {
const nextUrl = this.urlQueueStream.read();
resolve(nextUrl);
});
}
else {
resolve(result);
}
this.urlQueueStream.emit('readdata');
});
}
/**
* Indicates whether the background processing of sitemap contents has successfully finished.
*
* If this is `false`, the background processing is either still in progress or was aborted.
*/
isSitemapFullyLoaded() {
return (this.sitemapParsingProgress.inProgressSitemapUrl === null &&
this.sitemapParsingProgress.pendingSitemapUrls.size === 0);
}
/**
* Start processing the sitemaps and loading the URLs.
*
* Resolves once all the sitemaps URLs have been fully loaded (sets `isSitemapFullyLoaded` to `true`).
*/
async load({ parseSitemapOptions, }) {
while (!this.isSitemapFullyLoaded() && !this.abortLoading) {
const sitemapUrl = this.sitemapParsingProgress.inProgressSitemapUrl ??
this.sitemapParsingProgress.pendingSitemapUrls.values().next().value;
try {
for await (const item of (0, utils_1.parseSitemap)([{ type: 'url', url: sitemapUrl }], this.proxyUrl, {
...parseSitemapOptions,
maxDepth: 0,
emitNestedSitemaps: true,
})) {
if (!item.originSitemapUrl) {
// This is a nested sitemap
this.sitemapParsingProgress.pendingSitemapUrls.add(item.loc);
continue;
}
if (!this.sitemapParsingProgress.inProgressEntries.has(item.loc)) {
await this.pushNextUrl(item.loc);
this.sitemapParsingProgress.inProgressEntries.add(item.loc);
}
}
}
catch (e) {
this.log.error('Error loading sitemap contents:', e);
}
this.sitemapParsingProgress.pendingSitemapUrls.delete(sitemapUrl);
this.sitemapParsingProgress.inProgressEntries.clear();
this.sitemapParsingProgress.inProgressSitemapUrl = null;
}
this.urlQueueStream.end();
}
/**
* Open a sitemap and start processing it.
*
* Resolves to a new instance of `SitemapRequestList`, which **might not be fully loaded yet** - i.e. the sitemap might still be loading in the background.
*
* Track the loading progress using the `isSitemapFullyLoaded` property.
*/
static async open(options) {
const requestList = new SitemapRequestList({
...options,
persistStateKey: options.persistStateKey ?? STATE_PERSISTENCE_KEY,
});
await requestList.restoreState();
void requestList.load({ parseSitemapOptions: options.parseSitemapOptions });
if (requestList.persistenceOptions.enable) {
requestList.events.on("persistState" /* EventType.PERSIST_STATE */, requestList.persistState);
}
options?.signal?.addEventListener('abort', () => {
requestList.abortLoading = true;
});
if (options.timeoutMillis) {
setTimeout(() => {
requestList.abortLoading = true;
}, options.timeoutMillis);
}
return requestList;
}
/**
* @inheritDoc
*/
length() {
return this.urlQueueStream.readableLength + this.handledUrlCount - this.inProgress.size - this.reclaimed.size;
}
/**
* @inheritDoc
*/
async isFinished() {
return ((await this.isEmpty()) && this.inProgress.size === 0 && (this.isSitemapFullyLoaded() || this.abortLoading));
}
/**
* @inheritDoc
*/
async isEmpty() {
return this.reclaimed.size === 0 && this.urlQueueStream.readableLength === 0;
}
/**
* @inheritDoc
*/
handledCount() {
return this.handledUrlCount;
}
/**
* @inheritDoc
*/
async persistState() {
if (this.persistStateKey === undefined) {
return;
}
this.store ?? (this.store = await key_value_store_1.KeyValueStore.open());
const urlQueue = [];
while (this.urlQueueStream.readableLength > 0) {
const url = this.urlQueueStream.read();
if (url === null) {
break;
}
urlQueue.push(url);
}
// Create a new stream, as we have read all the URLs from the current one.
// Pushing the urls back to the original stream might not be possible if it has been ended.
const newStream = this.createNewStream(this.urlQueueStream.readableHighWaterMark);
for (const url of urlQueue) {
newStream.push(url);
}
if (this.urlQueueStream.writableEnded) {
newStream.end();
}
this.urlQueueStream = newStream;
await this.store.setValue(this.persistStateKey, {
sitemapParsingProgress: {
pendingSitemapUrls: Array.from(this.sitemapParsingProgress.pendingSitemapUrls),
inProgressSitemapUrl: this.sitemapParsingProgress.inProgressSitemapUrl,
inProgressEntries: Array.from(this.sitemapParsingProgress.inProgressEntries),
},
urlQueue,
reclaimed: [...this.inProgress, ...this.reclaimed], // In-progress and reclaimed requests will be both retried if state is restored
requestData: Array.from(this.requestData.entries()),
abortLoading: this.abortLoading,
closed: this.closed,
});
}
async restoreState() {
await (0, utils_2.purgeDefaultStorages)({ onlyPurgeOnce: true });
if (this.persistStateKey === undefined) {
return;
}
this.store ?? (this.store = await key_value_store_1.KeyValueStore.open());
const state = await this.store.getValue(this.persistStateKey);
if (state === null) {
return;
}
this.reclaimed = new Set(state.reclaimed);
this.sitemapParsingProgress = {
pendingSitemapUrls: new Set(state.sitemapParsingProgress.pendingSitemapUrls),
inProgressSitemapUrl: state.sitemapParsingProgress.inProgressSitemapUrl,
inProgressEntries: new Set(state.sitemapParsingProgress.inProgressEntries),
};
this.requestData = new Map(state.requestData ?? []);
for (const url of state.urlQueue) {
this.urlQueueStream.push(url);
}
this.abortLoading = state.abortLoading;
this.closed = state.closed;
}
/**
* @inheritDoc
*/
async fetchNextRequest() {
// Try to return a reclaimed request first
let nextUrl = this.reclaimed.values().next().value;
if (nextUrl) {
this.reclaimed.delete(nextUrl);
}
else {
// Otherwise read next url from the stream
nextUrl = await this.readNextUrl();
if (!nextUrl) {
return null;
}
this.requestData.set(nextUrl, new request_1.Request({ url: nextUrl }));
}
this.inProgress.add(nextUrl);
return this.requestData.get(nextUrl);
}
/**
* @inheritDoc
*/
async *[Symbol.asyncIterator]() {
while (!(await this.isFinished())) {
const request = await this.fetchNextRequest();
if (!request)
break;
yield request;
}
}
/**
* @inheritDoc
*/
async reclaimRequest(request) {
this.ensureInProgressAndNotReclaimed(request.url);
this.reclaimed.add(request.url);
this.inProgress.delete(request.url);
}
/**
* Aborts the internal sitemap loading, stops the processing of the sitemap contents and drops all the pending URLs.
*
* Calling `fetchNextRequest()` after this method will always return `null`.
*/
async teardown() {
this.closed = true;
this.abortLoading = true;
this.events.off("persistState" /* EventType.PERSIST_STATE */, this.persistState);
await this.persistState();
this.urlQueueStream.emit('readdata'); // unblocks the potentially waiting `pushNextUrl` call
}
/**
* @inheritDoc
*/
async markRequestHandled(request) {
this.handledUrlCount += 1;
this.ensureInProgressAndNotReclaimed(request.url);
this.inProgress.delete(request.url);
this.requestData.delete(request.url);
}
ensureInProgressAndNotReclaimed(url) {
if (!this.inProgress.has(url)) {
throw new Error(`The request is not being processed (url: ${url})`);
}
if (this.reclaimed.has(url)) {
throw new Error(`The request was already reclaimed (url: ${url})`);
}
}
}
exports.SitemapRequestList = SitemapRequestList;
//# sourceMappingURL=sitemap_request_list.js.map