// @crawlee/utils
// A set of shared utilities that can be used by crawlers.
// NOTE: This file is CommonJS output generated by the TypeScript compiler (tsc).
;
// Mark the module as transpiled ESM so `esModuleInterop` consumers unwrap it correctly.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare exported constants; they are assigned below after their definitions.
exports.URL_WITH_COMMAS_REGEX = exports.URL_NO_COMMAS_REGEX = void 0;
exports.isDocker = isDocker;
exports.isContainerized = isContainerized;
exports.isLambda = isLambda;
exports.getCgroupsVersion = getCgroupsVersion;
exports.weightedAvg = weightedAvg;
exports.sleep = sleep;
exports.snakeCaseToCamelCase = snakeCaseToCamelCase;
exports.expandShadowRoots = expandShadowRoots;
const tslib_1 = require("tslib");
// `promises_1` = node:fs/promises (default-imported via tslib interop helper).
const promises_1 = tslib_1.__importDefault(require("node:fs/promises"));
// `promises_2` = node:timers/promises (namespace import).
const promises_2 = require("node:timers/promises");
/**
 * Default regular expression to match URLs in a string that may be plain text, JSON, CSV or other. It supports common URL characters
 * and does not support URLs containing commas or spaces. The URLs also may contain Unicode letters (not symbols).
 *
 * NOTE: The regex carries the `g` flag, which makes `.test()`/`.exec()` stateful
 * (they advance `lastIndex`). Reset `lastIndex` or use `String.prototype.match`/`matchAll`.
 */
exports.URL_NO_COMMAS_REGEX = /https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+.~#?&/=()]*)?/giu;
/**
 * Regular expression that, in addition to the default regular expression `URL_NO_COMMAS_REGEX`, supports matching commas in URL path and query.
 * Note, however, that this may prevent parsing URLs from comma delimited lists, or the URLs may become malformed.
 *
 * NOTE: Also carries the stateful `g` flag — see `URL_NO_COMMAS_REGEX` caveat.
 */
exports.URL_WITH_COMMAS_REGEX = /https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+,.~#?&/=()]*)?/giu;
// Cached promise used by `isDocker()` so the filesystem is probed only once.
let isDockerPromiseCache;
/**
 * Runs both Docker heuristics concurrently and resolves to true when either one hits:
 * the `/.dockerenv` marker file exists, or `/proc/self/cgroup` mentions "docker".
 * Any filesystem error is treated as "not Docker" for that probe.
 */
async function createIsDockerPromise() {
    const hasDockerEnvFile = promises_1.default
        .stat('/.dockerenv')
        .then(() => true, () => false);
    const cgroupMentionsDocker = promises_1.default
        .readFile('/proc/self/cgroup', 'utf8')
        .then((content) => content.includes('docker'), () => false);
    const probes = await Promise.all([hasDockerEnvFile, cgroupMentionsDocker]);
    return probes.some(Boolean);
}
/**
 * Returns a `Promise` that resolves to true if the code is running in a Docker container.
 * The underlying filesystem probe is performed at most once; subsequent calls reuse
 * the cached promise.
 */
async function isDocker(forceReset) {
    // `forceReset` is internal, used by unit tests to bypass the cached result.
    if (forceReset || isDockerPromiseCache === undefined) {
        isDockerPromiseCache = createIsDockerPromise();
    }
    return isDockerPromiseCache;
}
// Cached boolean so the containerization check touches the filesystem only once.
let isContainerizedResult;
/**
 * Detects if crawlee is running in a containerized environment (Docker or Kubernetes).
 * AWS Lambda is explicitly treated as NOT containerized.
 */
async function isContainerized() {
    // Value is very unlikely to change. Cache the result after the first execution.
    if (isContainerizedResult !== undefined) {
        return isContainerizedResult;
    }
    // return false if running in aws lambda
    if (isLambda()) {
        isContainerizedResult = false;
        return isContainerizedResult;
    }
    // Reuse the shared Docker detection (probes /.dockerenv and /proc/self/cgroup)
    // instead of duplicating those two filesystem checks inline.
    const dockerResult = await createIsDockerPromise();
    // KUBERNETES_SERVICE_HOST is injected into every pod by Kubernetes.
    isContainerizedResult = dockerResult || !!process.env.KUBERNETES_SERVICE_HOST;
    return isContainerizedResult;
}
/**
 * Returns true when running inside AWS Lambda, detected via the
 * AWS_LAMBDA_FUNCTION_MEMORY_SIZE environment variable that Lambda sets.
 */
function isLambda() {
    return Boolean(process.env.AWS_LAMBDA_FUNCTION_MEMORY_SIZE);
}
// Cached cgroups version ('V1' | 'V2' | null); undefined until first computed.
let _cgroupsVersion;
/**
 * Gets the cgroup version by checking for a directory at /sys/fs/cgroup/memory.
 * @returns "V1" or "V2" for the version of cgroup or null if cgroup is not found.
 */
async function getCgroupsVersion(forceReset) {
    // Parameter forceReset is internal, used by unit tests to invalidate the cache.
    if (!forceReset && _cgroupsVersion !== undefined) {
        return _cgroupsVersion;
    }
    // True when the given path is accessible; any error counts as "absent".
    const pathExists = async (path) => {
        try {
            await promises_1.default.access(path);
            return true;
        }
        catch {
            return false;
        }
    };
    // Without the cgroup root directory, cgroups are not available at all.
    if (!(await pathExists('/sys/fs/cgroup/'))) {
        _cgroupsVersion = null;
        return null;
    }
    // The `memory` controller directory only exists in cgroups V1 hierarchies.
    _cgroupsVersion = (await pathExists('/sys/fs/cgroup/memory/')) ? 'V1' : 'V2';
    return _cgroupsVersion;
}
/**
 * Computes a weighted average of an array of numbers, complemented by an array of weights.
 * Returns NaN when the arrays are empty or the total weight is zero (0/0).
 * @ignore
 */
function weightedAvg(arrValues, arrWeights) {
    let weightedSum = 0;
    let totalWeight = 0;
    for (let i = 0; i < arrValues.length; i++) {
        weightedSum += arrValues[i] * arrWeights[i];
        totalWeight += arrWeights[i];
    }
    return weightedSum / totalWeight;
}
/**
 * Returns a `Promise` that resolves after a specific period of time. This is useful to implement waiting
 * in your code, e.g. to prevent overloading of target website or to avoid bot detection.
 *
 * **Example usage:**
 *
 * ```
 * import { sleep } from 'crawlee';
 *
 * ...
 *
 * // Sleep 1.5 seconds
 * await sleep(1500);
 * ```
 * @param millis Period of time to sleep, in milliseconds. If not a positive number, the returned promise resolves immediately.
 */
async function sleep(millis) {
    const { setTimeout: delay } = promises_2;
    // Normalize null to undefined so the promisified timer treats it as "no delay".
    await delay(millis ?? undefined);
}
/**
 * Converts SNAKE_CASE to camelCase (e.g. `FOO_BAR` -> `fooBar`).
 * The whole input is lowercased first, then each segment after an
 * underscore gets its first character capitalized.
 * @ignore
 */
function snakeCaseToCamelCase(snakeCaseStr) {
    const segments = snakeCaseStr.toLowerCase().split('_');
    let result = segments[0];
    for (const segment of segments.slice(1)) {
        result += segment.charAt(0).toUpperCase() + segment.slice(1);
    }
    return result;
}
/**
 * Traverses DOM and expands shadow-root elements (created by custom components),
 * returning the serialized HTML of the whole document with shadow content inlined.
 * Intended to run in a browser context where `document` is a live DOM Document.
 * NOTE(review): mutates the passed document in place (`innerHTML +=`) — callers
 * should not expect the original DOM to stay untouched.
 * @ignore
 */
function expandShadowRoots(document) {
    // Returns HTML of given shadow DOM.
    // Concatenates each child's text (`nodeValue`) or markup (`outerHTML`);
    // nodes with neither contribute an empty string.
    function getShadowDomHtml(shadowRoot) {
        let shadowHTML = '';
        for (const el of shadowRoot.childNodes) {
            shadowHTML += el.nodeValue ?? el.outerHTML ?? '';
        }
        return shadowHTML;
    }
    // Recursively replaces shadow DOMs with their HTML.
    function replaceShadowDomsWithHtml(rootElement) {
        for (const el of rootElement.querySelectorAll('*')) {
            if (el.shadowRoot) {
                // Expand nested shadow roots first so the serialization below sees them.
                replaceShadowDomsWithHtml(el.shadowRoot);
                // Prefer the native Element.getHTML() serialization where available
                // (`?.` guards older browsers that don't implement it yet).
                let content = el.getHTML?.({ serializableShadowRoots: true }).trim();
                // Falls back to the manual walk when getHTML is missing (undefined)
                // or produced an empty string.
                if (!(content?.length > 0)) {
                    content = getShadowDomHtml(el.shadowRoot) ?? '';
                }
                // Append rather than replace, so existing light-DOM children are kept.
                el.innerHTML += content;
            }
        }
    }
    replaceShadowDomsWithHtml(document.body);
    return document.documentElement.outerHTML;
}
//# sourceMappingURL=general.js.map