// @crawlee/utils
// A set of shared utilities that can be used by crawlers.
// NOTE: This file is CommonJS output generated by the TypeScript compiler (tsc).
;
// Mark the module as transpiled ESM so `esModuleInterop` consumers unwrap it correctly.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare exported constants; they are assigned below after their definitions.
exports.URL_WITH_COMMAS_REGEX = exports.URL_NO_COMMAS_REGEX = void 0;
exports.isDocker = isDocker;
exports.isContainerized = isContainerized;
exports.isLambda = isLambda;
exports.getCgroupsVersion = getCgroupsVersion;
exports.weightedAvg = weightedAvg;
exports.sleep = sleep;
exports.snakeCaseToCamelCase = snakeCaseToCamelCase;
exports.expandShadowRoots = expandShadowRoots;
const tslib_1 = require("tslib");
// `promises_1` = node:fs/promises (default-imported via tslib interop helper).
const promises_1 = tslib_1.__importDefault(require("node:fs/promises"));
// `promises_2` = node:timers/promises (namespace import).
const promises_2 = require("node:timers/promises");
/**
 * Default regular expression to match URLs in a string that may be plain text, JSON, CSV or other. It supports common URL characters
 * and does not support URLs containing commas or spaces. The URLs also may contain Unicode letters (not symbols).
 *
 * NOTE: The regex carries the `g` flag, which makes `.test()`/`.exec()` stateful
 * (they advance `lastIndex`). Reset `lastIndex` or use `String.prototype.match`/`matchAll`.
 */
exports.URL_NO_COMMAS_REGEX = /https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+.~#?&/=()]*)?/giu;
/**
 * Regular expression that, in addition to the default regular expression `URL_NO_COMMAS_REGEX`, supports matching commas in URL path and query.
 * Note, however, that this may prevent parsing URLs from comma delimited lists, or the URLs may become malformed.
 *
 * NOTE: Also carries the stateful `g` flag — see `URL_NO_COMMAS_REGEX` caveat.
 */
exports.URL_WITH_COMMAS_REGEX = /https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+,.~#?&/=()]*)?/giu;
// Cached promise used by `isDocker()` so the filesystem is probed only once.
let isDockerPromiseCache;
/**
 * Runs both Docker heuristics concurrently and resolves to true when either one hits:
 * the `/.dockerenv` marker file exists, or `/proc/self/cgroup` mentions "docker".
 * Any filesystem error is treated as "not Docker" for that probe.
 */
async function createIsDockerPromise() {
    const hasDockerEnvFile = promises_1.default
        .stat('/.dockerenv')
        .then(() => true, () => false);
    const cgroupMentionsDocker = promises_1.default
        .readFile('/proc/self/cgroup', 'utf8')
        .then((content) => content.includes('docker'), () => false);
    const probes = await Promise.all([hasDockerEnvFile, cgroupMentionsDocker]);
    return probes.some(Boolean);
}
/**
 * Returns a `Promise` that resolves to true if the code is running in a Docker container.
 * The underlying filesystem probe is performed at most once; subsequent calls reuse
 * the cached promise.
 */
async function isDocker(forceReset) {
    // `forceReset` is internal, used by unit tests to bypass the cached result.
    if (forceReset || isDockerPromiseCache === undefined) {
        isDockerPromiseCache = createIsDockerPromise();
    }
    return isDockerPromiseCache;
}
// Cached boolean so the containerization check touches the filesystem only once.
let isContainerizedResult;
/**
 * Detects if crawlee is running in a containerized environment (Docker or Kubernetes).
 * AWS Lambda is explicitly treated as NOT containerized.
 */
async function isContainerized() {
    // Value is very unlikely to change. Cache the result after the first execution.
    if (isContainerizedResult !== undefined) {
        return isContainerizedResult;
    }
    // return false if running in aws lambda
    if (isLambda()) {
        isContainerizedResult = false;
        return isContainerizedResult;
    }
    // Reuse the shared Docker detection (probes /.dockerenv and /proc/self/cgroup)
    // instead of duplicating those two filesystem checks inline.
    const dockerResult = await createIsDockerPromise();
    // KUBERNETES_SERVICE_HOST is injected into every pod by Kubernetes.
    isContainerizedResult = dockerResult || !!process.env.KUBERNETES_SERVICE_HOST;
    return isContainerizedResult;
}
/**
 * Returns true when running inside AWS Lambda, detected via the
 * AWS_LAMBDA_FUNCTION_MEMORY_SIZE environment variable that Lambda sets.
 */
function isLambda() {
    return Boolean(process.env.AWS_LAMBDA_FUNCTION_MEMORY_SIZE);
}
// Cached cgroups version ('V1' | 'V2' | null); undefined until first computed.
let _cgroupsVersion;
/**
 * Gets the cgroup version by checking for a directory at /sys/fs/cgroup/memory.
 * @returns "V1" or "V2" for the version of cgroup or null if cgroup is not found.
 */
async function getCgroupsVersion(forceReset) {
    // Parameter forceReset is internal, used by unit tests to invalidate the cache.
    if (!forceReset && _cgroupsVersion !== undefined) {
        return _cgroupsVersion;
    }
    // True when the given path is accessible; any error counts as "absent".
    const pathExists = async (path) => {
        try {
            await promises_1.default.access(path);
            return true;
        }
        catch {
            return false;
        }
    };
    // Without the cgroup root directory, cgroups are not available at all.
    if (!(await pathExists('/sys/fs/cgroup/'))) {
        _cgroupsVersion = null;
        return null;
    }
    // The `memory` controller directory only exists in cgroups V1 hierarchies.
    _cgroupsVersion = (await pathExists('/sys/fs/cgroup/memory/')) ? 'V1' : 'V2';
    return _cgroupsVersion;
}
/**
 * Computes a weighted average of an array of numbers, complemented by an array of weights.
 * Returns NaN when the arrays are empty or the total weight is zero (0/0).
 * @ignore
 */
function weightedAvg(arrValues, arrWeights) {
    let weightedSum = 0;
    let totalWeight = 0;
    for (let i = 0; i < arrValues.length; i++) {
        weightedSum += arrValues[i] * arrWeights[i];
        totalWeight += arrWeights[i];
    }
    return weightedSum / totalWeight;
}
/**
 * Returns a `Promise` that resolves after a specific period of time. This is useful to implement waiting
 * in your code, e.g. to prevent overloading of target website or to avoid bot detection.
 *
 * **Example usage:**
 *
 * ```
 * import { sleep } from 'crawlee';
 *
 * ...
 *
 * // Sleep 1.5 seconds
 * await sleep(1500);
 * ```
 * @param millis Period of time to sleep, in milliseconds. If not a positive number, the returned promise resolves immediately.
 */
async function sleep(millis) {
    const { setTimeout: delay } = promises_2;
    // Normalize null to undefined so the promisified timer treats it as "no delay".
    await delay(millis ?? undefined);
}
/**
 * Converts SNAKE_CASE to camelCase (e.g. `FOO_BAR` -> `fooBar`).
 * The whole input is lowercased first, then each segment after an
 * underscore gets its first character capitalized.
 * @ignore
 */
function snakeCaseToCamelCase(snakeCaseStr) {
    const segments = snakeCaseStr.toLowerCase().split('_');
    let result = segments[0];
    for (const segment of segments.slice(1)) {
        result += segment.charAt(0).toUpperCase() + segment.slice(1);
    }
    return result;
}
/**
 * Traverses DOM and expands shadow-root elements (created by custom components),
 * returning the serialized HTML of the whole document with shadow content inlined.
 * Intended to run in a browser context where `document` is a live DOM Document.
 * NOTE(review): mutates the passed document in place (`innerHTML +=`) — callers
 * should not expect the original DOM to stay untouched.
 * @ignore
 */
function expandShadowRoots(document) {
    // Returns HTML of given shadow DOM.
    // Concatenates each child's text (`nodeValue`) or markup (`outerHTML`);
    // nodes with neither contribute an empty string.
    function getShadowDomHtml(shadowRoot) {
        let shadowHTML = '';
        for (const el of shadowRoot.childNodes) {
            shadowHTML += el.nodeValue ?? el.outerHTML ?? '';
        }
        return shadowHTML;
    }
    // Recursively replaces shadow DOMs with their HTML.
    function replaceShadowDomsWithHtml(rootElement) {
        for (const el of rootElement.querySelectorAll('*')) {
            if (el.shadowRoot) {
                // Expand nested shadow roots first so the serialization below sees them.
                replaceShadowDomsWithHtml(el.shadowRoot);
                // Prefer the native Element.getHTML() serialization where available
                // (`?.` guards older browsers that don't implement it yet).
                let content = el.getHTML?.({ serializableShadowRoots: true }).trim();
                // Falls back to the manual walk when getHTML is missing (undefined)
                // or produced an empty string.
                if (!(content?.length > 0)) {
                    content = getShadowDomHtml(el.shadowRoot) ?? '';
                }
                // Append rather than replace, so existing light-DOM children are kept.
                el.innerHTML += content;
            }
        }
    }
    replaceShadowDomsWithHtml(document.body);
    return document.documentElement.outerHTML;
}
//# sourceMappingURL=general.js.map