@crawlee/core
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
272 lines • 14.1 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.EnqueueStrategy = void 0;
exports.enqueueLinks = enqueueLinks;
exports.resolveBaseUrlForEnqueueLinksFiltering = resolveBaseUrlForEnqueueLinksFiltering;
const tslib_1 = require("tslib");
const ow_1 = tslib_1.__importDefault(require("ow"));
const tldts_1 = require("tldts");
const log_1 = tslib_1.__importDefault(require("@apify/log"));
const shared_1 = require("./shared");
/**
* The different enqueueing strategies available.
*
* Depending on the strategy you select, we will only check certain parts of the URLs found. Here is a diagram of each URL part and their name:
*
* ```md
* Protocol Domain
* ┌────┐ ┌─────────┐
* https://example.crawlee.dev/...
* │ └─────────────────┤
* │ Hostname │
* │ │
* └─────────────────────────┘
* Origin
*```
*
* - The `Protocol` is usually `http` or `https`
* - The `Domain` represents the path without any possible subdomains to a website. For example, `crawlee.dev` is the domain of `https://example.crawlee.dev/`
* - The `Hostname` is the full path to a website, including any subdomains. For example, `example.crawlee.dev` is the hostname of `https://example.crawlee.dev/`
* - The `Origin` is the combination of the `Protocol` and `Hostname`. For example, `https://example.crawlee.dev` is the origin of `https://example.crawlee.dev/`
*/
var EnqueueStrategy;
(function (EnqueueStrategy) {
/**
* Matches any URLs found
*/
EnqueueStrategy["All"] = "all";
/**
* Matches any URLs that have the same hostname.
* For example, `https://wow.example.com/hello` will be matched for a base url of `https://wow.example.com/`, but
* `https://example.com/hello` will not be matched.
*
* > This strategy will match both `http` and `https` protocols regardless of the base URL protocol.
*/
EnqueueStrategy["SameHostname"] = "same-hostname";
/**
* Matches any URLs that have the same domain as the base URL.
* For example, `https://wow.an.example.com` and `https://example.com` will both be matched for a base url of
* `https://example.com`.
*
* > This strategy will match both `http` and `https` protocols regardless of the base URL protocol.
*/
EnqueueStrategy["SameDomain"] = "same-domain";
/**
* Matches any URLs that have the same hostname and protocol.
* For example, `https://wow.example.com/hello` will be matched for a base url of `https://wow.example.com/`, but
* `http://wow.example.com/hello` will not be matched.
*
* > This strategy will ensure the protocol of the base URL is the same as the protocol of the URL to be enqueued.
*/
EnqueueStrategy["SameOrigin"] = "same-origin";
})(EnqueueStrategy || (exports.EnqueueStrategy = EnqueueStrategy = {}));
/**
* This function enqueues the urls provided to the {@link RequestQueue} provided. If you want to automatically find and enqueue links,
* you should use the context-aware `enqueueLinks` function provided on the crawler contexts.
*
* Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
* and override settings of the enqueued {@link Request} objects.
*
* **Example usage**
*
* ```javascript
* await enqueueLinks({
* urls: aListOfFoundUrls,
* requestQueue,
* selector: 'a.product-detail',
* globs: [
* 'https://www.example.com/handbags/*',
* 'https://www.example.com/purses/*'
* ],
* });
* ```
*
* @param options All `enqueueLinks()` parameters are passed via an options object.
* @returns Promise that resolves to {@link BatchAddRequestsResult} object.
*/
async function enqueueLinks(options) {
if (!options || Object.keys(options).length === 0) {
throw new RangeError([
'enqueueLinks() was called without the required options. You can only do that when you use the `crawlingContext.enqueueLinks()` method in request handlers.',
'Check out our guide on how to use enqueueLinks() here: https://crawlee.dev/js/docs/examples/crawl-relative-links',
].join('\n'));
}
(0, ow_1.default)(options, ow_1.default.object.exactShape({
urls: ow_1.default.array.ofType(ow_1.default.string),
requestQueue: ow_1.default.object.hasKeys('addRequestsBatched'),
robotsTxtFile: ow_1.default.optional.object.hasKeys('isAllowed'),
onSkippedRequest: ow_1.default.optional.function,
forefront: ow_1.default.optional.boolean,
skipNavigation: ow_1.default.optional.boolean,
limit: ow_1.default.optional.number,
selector: ow_1.default.optional.string,
baseUrl: ow_1.default.optional.string,
userData: ow_1.default.optional.object,
label: ow_1.default.optional.string,
pseudoUrls: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.object.hasKeys('purl'))),
globs: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.object.hasKeys('glob'))),
exclude: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.regExp, ow_1.default.object.hasKeys('glob'), ow_1.default.object.hasKeys('regexp'))),
regexps: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.regExp, ow_1.default.object.hasKeys('regexp'))),
transformRequestFunction: ow_1.default.optional.function,
strategy: ow_1.default.optional.string.oneOf(Object.values(EnqueueStrategy)),
waitForAllRequestsToBeAdded: ow_1.default.optional.boolean,
}));
const { requestQueue, limit, urls, pseudoUrls, exclude, globs, regexps, transformRequestFunction, forefront, waitForAllRequestsToBeAdded, robotsTxtFile, onSkippedRequest, } = options;
const urlExcludePatternObjects = [];
const urlPatternObjects = [];
if (exclude?.length) {
for (const excl of exclude) {
if (typeof excl === 'string' || 'glob' in excl) {
urlExcludePatternObjects.push(...(0, shared_1.constructGlobObjectsFromGlobs)([excl]));
}
else if (excl instanceof RegExp || 'regexp' in excl) {
urlExcludePatternObjects.push(...(0, shared_1.constructRegExpObjectsFromRegExps)([excl]));
}
}
}
if (pseudoUrls?.length) {
log_1.default.deprecated('`pseudoUrls` option is deprecated, use `globs` or `regexps` instead');
urlPatternObjects.push(...(0, shared_1.constructRegExpObjectsFromPseudoUrls)(pseudoUrls));
}
if (globs?.length) {
urlPatternObjects.push(...(0, shared_1.constructGlobObjectsFromGlobs)(globs));
}
if (regexps?.length) {
urlPatternObjects.push(...(0, shared_1.constructRegExpObjectsFromRegExps)(regexps));
}
if (!urlPatternObjects.length) {
options.strategy ?? (options.strategy = EnqueueStrategy.SameHostname);
}
const enqueueStrategyPatterns = [];
if (options.baseUrl) {
const url = new URL(options.baseUrl);
switch (options.strategy) {
case EnqueueStrategy.SameHostname:
// We need to get the origin of the passed in domain in the event someone sets baseUrl
// to an url like https://example.com/deep/default/path and one of the found urls is an
// absolute relative path (/path/to/page)
enqueueStrategyPatterns.push({ glob: ignoreHttpSchema(`${url.origin}/**`) });
break;
case EnqueueStrategy.SameDomain: {
// Get the actual hostname from the base url
const baseUrlHostname = (0, tldts_1.getDomain)(url.hostname, { mixedInputs: false });
if (baseUrlHostname) {
// We have a hostname, so we can use it to match all links on the page that point to it and any subdomains of it
url.hostname = baseUrlHostname;
enqueueStrategyPatterns.push({ glob: ignoreHttpSchema(`${url.origin.replace(baseUrlHostname, `*.${baseUrlHostname}`)}/**`) }, { glob: ignoreHttpSchema(`${url.origin}/**`) });
}
else {
// We don't have a hostname (can happen for ips for instance), so reproduce the same behavior
// as SameDomainAndSubdomain
enqueueStrategyPatterns.push({ glob: ignoreHttpSchema(`${url.origin}/**`) });
}
break;
}
case EnqueueStrategy.SameOrigin: {
// The same behavior as SameHostname, but respecting the protocol of the URL
enqueueStrategyPatterns.push({ glob: `${url.origin}/**` });
break;
}
case EnqueueStrategy.All:
default:
enqueueStrategyPatterns.push({ glob: `http{s,}://**` });
break;
}
}
async function reportSkippedRequests(skippedRequests, reason) {
if (onSkippedRequest && skippedRequests.length > 0) {
await Promise.all(skippedRequests.map((request) => {
return onSkippedRequest({ url: request.url, reason: request.skippedReason ?? reason });
}));
}
}
let requestOptions = (0, shared_1.createRequestOptions)(urls, options);
if (robotsTxtFile) {
const skippedRequests = [];
requestOptions = requestOptions.filter((request) => {
if (robotsTxtFile.isAllowed(request.url)) {
return true;
}
skippedRequests.push(request);
return false;
});
await reportSkippedRequests(skippedRequests, 'robotsTxt');
}
if (transformRequestFunction) {
const skippedRequests = [];
requestOptions = requestOptions
.map((request) => {
const transformedRequest = transformRequestFunction(request);
if (!transformedRequest) {
skippedRequests.push(request);
}
return transformedRequest;
})
.filter((r) => Boolean(r));
await reportSkippedRequests(skippedRequests, 'filters');
}
async function createFilteredRequests() {
const skippedRequests = [];
// No user provided patterns means we can skip an extra filtering step
if (urlPatternObjects.length === 0) {
return (0, shared_1.createRequests)(requestOptions, enqueueStrategyPatterns, urlExcludePatternObjects, options.strategy, (url) => skippedRequests.push(url));
}
// Generate requests based on the user patterns first
const generatedRequestsFromUserFilters = (0, shared_1.createRequests)(requestOptions, urlPatternObjects, urlExcludePatternObjects, options.strategy, (url) => skippedRequests.push(url));
// ...then filter them by the enqueue links strategy (making this an AND check)
const filtered = (0, shared_1.filterRequestsByPatterns)(generatedRequestsFromUserFilters, enqueueStrategyPatterns, (url) => skippedRequests.push(url));
await reportSkippedRequests(skippedRequests.map((url) => ({ url })), 'filters');
return filtered;
}
let requests = await createFilteredRequests();
if (typeof limit === 'number' && limit < requests.length) {
await reportSkippedRequests(requests.slice(limit), 'limit');
requests = requests.slice(0, limit);
}
const { addedRequests } = await requestQueue.addRequestsBatched(requests, {
forefront,
waitForAllRequestsToBeAdded,
});
return { processedRequests: addedRequests, unprocessedRequests: [] };
}
/**
* @internal
* This method helps resolve the baseUrl that will be used for filtering in {@link enqueueLinks}.
* - If a user provides a base url, we always return it
* - If a user specifies {@link EnqueueStrategy.All} strategy, they do not care if the newly found urls are on the original
* request domain, or a redirected one
* - In all other cases, we return the domain of the original request as that's the one we need to use for filtering
*/
function resolveBaseUrlForEnqueueLinksFiltering({ enqueueStrategy, finalRequestUrl, originalRequestUrl, userProvidedBaseUrl, }) {
// User provided base url takes priority
if (userProvidedBaseUrl) {
return userProvidedBaseUrl;
}
const originalUrlOrigin = new URL(originalRequestUrl).origin;
const finalUrlOrigin = new URL(finalRequestUrl ?? originalRequestUrl).origin;
// We can assume users want to go off the domain in this case
if (enqueueStrategy === EnqueueStrategy.All) {
return finalUrlOrigin;
}
// If the user wants to ensure the same domain is accessed, regardless of subdomains, we check to ensure the domains match
// Returning undefined here is intentional! If the domains don't match, having no baseUrl in enqueueLinks will cause it to not enqueue anything
// which is the intended behavior (since we went off domain)
if (enqueueStrategy === EnqueueStrategy.SameDomain) {
const originalHostname = (0, tldts_1.getDomain)(originalUrlOrigin, { mixedInputs: false });
const finalHostname = (0, tldts_1.getDomain)(finalUrlOrigin, { mixedInputs: false });
if (originalHostname === finalHostname) {
return finalUrlOrigin;
}
return undefined;
}
// Always enqueue urls that are from the same origin in all other cases, as the filtering happens on the original request url, even if there was a redirect
// before actually finding the urls
return originalUrlOrigin;
}
/**
* Internal function that changes the enqueue globs to match both http and https
*/
function ignoreHttpSchema(pattern) {
return pattern.replace(/^(https?):\/\//, 'http{s,}://');
}
//# sourceMappingURL=enqueue_links.js.map