@crawlee/core
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
224 lines • 8.04 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.tryAbsoluteURL = void 0;
exports.updateEnqueueLinksPatternCache = updateEnqueueLinksPatternCache;
exports.constructRegExpObjectsFromPseudoUrls = constructRegExpObjectsFromPseudoUrls;
exports.constructGlobObjectsFromGlobs = constructGlobObjectsFromGlobs;
exports.validateGlobPattern = validateGlobPattern;
exports.constructRegExpObjectsFromRegExps = constructRegExpObjectsFromRegExps;
exports.createRequests = createRequests;
exports.filterRequestsByPatterns = filterRequestsByPatterns;
exports.createRequestOptions = createRequestOptions;
const node_url_1 = require("node:url");
const minimatch_1 = require("minimatch");
const pseudo_url_1 = require("@apify/pseudo_url");
const request_1 = require("../request");
var utils_1 = require("@crawlee/utils");
Object.defineProperty(exports, "tryAbsoluteURL", { enumerable: true, get: function () { return utils_1.tryAbsoluteURL; } });
/** Maximum number of entries kept in `enqueueLinksPatternCache` before the oldest one is evicted. */
const MAX_ENQUEUE_LINKS_CACHE_SIZE = 1000;
/**
 * Cache of already constructed pattern objects, keyed by the raw pattern item they were built from.
 * It allows the Actor UI `globs`/`regexps`/`pseudoUrls` output to be passed to `enqueueLinks()` directly
 * while each pattern is only constructed once.
 * @ignore
 */
const enqueueLinksPatternCache = new Map();
/**
 * Stores `pattern` in the cache under `item` and, when the cache grows beyond
 * `MAX_ENQUEUE_LINKS_CACHE_SIZE`, removes the earliest inserted entry.
 * @ignore
 */
function updateEnqueueLinksPatternCache(item, pattern) {
    enqueueLinksPatternCache.set(item, pattern);
    if (enqueueLinksPatternCache.size <= MAX_ENQUEUE_LINKS_CACHE_SIZE) {
        return;
    }
    // A Map iterates in insertion order, so its first key belongs to the oldest entry.
    const [oldestKey] = enqueueLinksPatternCache.keys();
    enqueueLinksPatternCache.delete(oldestKey);
}
/**
 * Helper factory used in the `enqueueLinks()` and `enqueueLinksByClickingElements()` functions
 * to construct RegExp objects from pseudo-URLs.
 *
 * Each item is either a pseudo-URL string or an object of the form `{ purl, ...requestOptions }`.
 * The result for every item is stored in `enqueueLinksPatternCache` (keyed by the item itself)
 * and reused on subsequent calls.
 *
 * @param pseudoUrls Array of pseudo-URL strings or `{ purl, ...requestOptions }` objects.
 * @returns Array of `{ regexp, ...requestOptions }` objects, in the same order as the input.
 * @ignore
 */
function constructRegExpObjectsFromPseudoUrls(pseudoUrls) {
    return pseudoUrls.map((item) => {
        // The cache is shared with the other pattern kinds and is keyed by the raw input,
        // so the same string may already be cached as a glob object. Only accept a cached
        // entry that actually carries a `regexp`; otherwise rebuild it below.
        const cached = enqueueLinksPatternCache.get(item);
        if (cached?.regexp) {
            return cached;
        }
        let regexpObject;
        if (typeof item === 'string') {
            regexpObject = { regexp: (0, pseudo_url_1.purlToRegExp)(item) };
        }
        else {
            // Everything except `purl` is carried over to the resulting object unchanged.
            const { purl, ...requestOptions } = item;
            regexpObject = { regexp: (0, pseudo_url_1.purlToRegExp)(purl), ...requestOptions };
        }
        updateEnqueueLinksPatternCache(item, regexpObject);
        return regexpObject;
    });
}
/**
 * Helper factory used in the `enqueueLinks()` and `enqueueLinksByClickingElements()` functions
 * to construct glob objects from glob pattern strings.
 *
 * Each item is either a glob string or an object of the form `{ glob, ...requestOptions }`.
 * Nullish items, blank strings and objects without a non-blank `glob` are dropped.
 * The result for every remaining item is stored in `enqueueLinksPatternCache` (keyed by the
 * item itself) and reused on subsequent calls.
 *
 * @param globs Array of glob strings or `{ glob, ...requestOptions }` objects.
 * @returns Array of `{ glob, ...requestOptions }` objects with the glob validated by `validateGlobPattern()`.
 * @ignore
 */
function constructGlobObjectsFromGlobs(globs) {
    return globs
        .filter((glob) => {
            // Skip possibly nullish, empty strings
            if (!glob) {
                return false;
            }
            const pattern = typeof glob === 'string' ? glob : glob.glob;
            return pattern ? pattern.trim().length > 0 : false;
        })
        .map((item) => {
            // The cache is shared with the other pattern kinds and is keyed by the raw input,
            // so the same string may already be cached as a regexp object. Only accept a cached
            // entry that actually carries a `glob`; otherwise rebuild it below.
            const cached = enqueueLinksPatternCache.get(item);
            if (cached?.glob) {
                return cached;
            }
            let globObject;
            if (typeof item === 'string') {
                globObject = { glob: validateGlobPattern(item) };
            }
            else {
                // Everything except `glob` is carried over to the resulting object unchanged.
                const { glob, ...requestOptions } = item;
                globObject = { glob: validateGlobPattern(glob), ...requestOptions };
            }
            updateEnqueueLinksPatternCache(item, globObject);
            return globObject;
        });
}
/**
 * Trims the given glob pattern and makes sure something is left of it.
 *
 * @param glob Glob pattern string.
 * @returns The pattern without leading and trailing whitespace.
 * @throws {Error} When the trimmed pattern is empty.
 * @internal
 */
function validateGlobPattern(glob) {
    const pattern = glob.trim();
    if (pattern.length > 0) {
        return pattern;
    }
    throw new Error(`Cannot parse Glob pattern '${pattern}': it must be an non-empty string`);
}
/**
 * Helper factory used in the `enqueueLinks()` and `enqueueLinksByClickingElements()` functions
 * to normalize RegExp input into regexp objects.
 *
 * A `RegExp` instance is wrapped as `{ regexp }`; any other item is used as it is.
 * The result for every item is stored in `enqueueLinksPatternCache` (keyed by the item itself)
 * and reused on subsequent calls.
 *
 * @param regexps Array of `RegExp` instances or regexp objects.
 * @returns Array of regexp objects, in the same order as the input.
 * @ignore
 */
function constructRegExpObjectsFromRegExps(regexps) {
    const toRegExpObject = (item) => {
        // Reuse the object built for this item earlier, if any.
        const cached = enqueueLinksPatternCache.get(item);
        if (cached) {
            return cached;
        }
        const regexpObject = item instanceof RegExp ? { regexp: item } : item;
        updateEnqueueLinksPatternCache(item, regexpObject);
        return regexpObject;
    };
    return regexps.map((item) => toRegExpObject(item));
}
/**
 * Turns request sources into `Request` instances, honoring exclude and include patterns.
 *
 * Sources whose URL matches any exclude pattern are dropped first. When no include patterns
 * are given, every remaining source becomes a request. Otherwise a source becomes a request only
 * if its URL matches an include pattern; the extra properties of the first matching pattern
 * are merged into the request options. `onSkippedUrl` is called with the URL of every
 * dropped source.
 *
 * @param requestOptions Array of URL strings or request option objects with a `url`.
 * @param urlPatternObjects Optional include patterns (`{ regexp }` or `{ glob }` objects with optional extra request options).
 * @param excludePatternObjects Exclude patterns, same shape as the include patterns.
 * @param strategy Value stored as `enqueueStrategy` on the created requests.
 * @param onSkippedUrl Optional callback invoked with each skipped URL.
 * @returns Array of created `Request` instances.
 * @ignore
 */
function createRequests(requestOptions, urlPatternObjects, excludePatternObjects = [], strategy, onSkippedUrl) {
    const excludeMatchers = excludePatternObjects.map(createPatternObjectMatcher);
    const includeMatchers = urlPatternObjects?.map(createPatternObjectMatcher);
    const toEntry = (opts) => {
        const url = typeof opts === 'string' ? opts : opts.url;
        return { url, opts };
    };
    const isNotExcluded = ({ url }) => {
        const excluded = excludeMatchers.some(({ match }) => match(url));
        if (excluded) {
            onSkippedUrl?.(url);
            return false;
        }
        return true;
    };
    const toRequest = ({ url, opts }) => {
        const isPlainUrl = typeof opts === 'string';
        if (!includeMatchers || !includeMatchers.length) {
            // Option objects are passed through as they are; only bare URLs get the strategy here.
            const init = isPlainUrl ? { url: opts, enqueueStrategy: strategy } : { ...opts };
            return new request_1.Request(init);
        }
        const matched = includeMatchers.find(({ match }) => match(url));
        if (!matched) {
            // didn't match any positive pattern
            onSkippedUrl?.(url);
            return null;
        }
        // Strip the matcher internals, the remainder are request options attached to the pattern.
        const { match, glob, regexp, ...patternRequestOptions } = matched;
        const base = isPlainUrl ? { url: opts } : opts;
        return new request_1.Request({ ...base, ...patternRequestOptions, enqueueStrategy: strategy });
    };
    // The exclude pass runs over all sources before any request is constructed.
    const candidates = requestOptions.map(toEntry).filter(isNotExcluded);
    return candidates.map(toRequest).filter(Boolean);
}
/**
 * Keeps only the requests whose URL matches at least one of the given patterns.
 *
 * When `patterns` is missing or empty, `requests` is returned as it is (the same object).
 * Otherwise a new array is returned and `onSkippedUrl` is called with the URL of every
 * request that matched no pattern.
 *
 * @param requests Requests (objects with a `url`) to filter.
 * @param patterns Optional pattern objects (`{ regexp }` or `{ glob }`).
 * @param onSkippedUrl Optional callback invoked with each skipped URL.
 * @returns The matching requests.
 */
function filterRequestsByPatterns(requests, patterns, onSkippedUrl) {
    if (!patterns?.length) {
        return requests;
    }
    const matchers = patterns.map(createPatternObjectMatcher);
    const kept = [];
    for (const request of requests) {
        const isMatching = matchers.some(({ match }) => match(request.url));
        if (!isMatching) {
            onSkippedUrl?.(request.url);
            continue;
        }
        kept.push(request);
    }
    return kept;
}
/**
 * Normalizes request sources into request option objects.
 *
 * Every source is copied into a fresh object (the inputs are never modified), its `url` is
 * resolved against `options.baseUrl` and replaced by the resulting absolute `href`, and
 * `enqueueStrategy` is set to `options.strategy`. Sources whose URL cannot be parsed are
 * dropped without an error.
 *
 * @param sources Array of URL strings or request option objects with a `url`.
 * @param options Optional settings:
 *   - `baseUrl`: base used to resolve relative URLs,
 *   - `strategy`: value stored as `enqueueStrategy`,
 *   - `userData`: default `userData` for sources that have none (nullish),
 *   - `label`: when a string, stored as `userData.label`,
 *   - `skipNavigation`: when truthy, `skipNavigation: true` is set on every result.
 * @returns Array of request option objects for the sources with a valid URL.
 * @ignore
 */
function createRequestOptions(sources, options = {}) {
    return sources.flatMap((src) => {
        const requestOptions = typeof src === 'string' ? { url: src } : { ...src };
        requestOptions.enqueueStrategy = options.strategy;
        // Parse once: the same parse both validates the URL and yields the absolute href.
        let href;
        try {
            href = new node_url_1.URL(requestOptions.url, options.baseUrl).href;
        }
        catch {
            // Unparsable URLs are deliberately skipped rather than reported.
            return [];
        }
        requestOptions.url = href;
        if (requestOptions.userData == null) {
            // Copy the defaults so that requests neither share one `userData` object with
            // each other nor with the caller's `options`.
            requestOptions.userData = { ...options.userData };
        }
        if (typeof options.label === 'string') {
            requestOptions.userData = {
                ...requestOptions.userData,
                label: options.label,
            };
        }
        if (options.skipNavigation) {
            requestOptions.skipNavigation = true;
        }
        return [requestOptions];
    });
}
/**
 * Extends a pattern object with a `match(url)` function.
 *
 * - With a `regexp`, the URL is tested against that regular expression.
 * - Otherwise, with a `glob`, the URL is matched by a `Minimatch` instance created with `nocase: true`.
 * - With neither, `match` always returns `false`.
 *
 * @param urlPatternObject Pattern object, `{ regexp }` or `{ glob }`, optionally with extra properties.
 * @returns A copy of `urlPatternObject` with the additional `match` function.
 * @ignore
 */
function createPatternObjectMatcher(urlPatternObject) {
    const { regexp, glob } = urlPatternObject;
    let match;
    if (regexp) {
        match = (url) => {
            // Global and sticky regexps remember `lastIndex` between `test()` calls, which would
            // make the result for a URL depend on the previously tested one. Always start at 0.
            if (regexp.global || regexp.sticky) {
                regexp.lastIndex = 0;
            }
            return regexp.test(url);
        };
    }
    else if (glob) {
        const matcher = new minimatch_1.Minimatch(glob, { nocase: true });
        match = (url) => matcher.match(url);
    }
    else {
        match = () => false;
    }
    return { ...urlPatternObject, match };
}
//# sourceMappingURL=shared.js.map