@crawlee/playwright
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
589 lines • 25.9 kB
JavaScript
"use strict";
/**
* A namespace that contains various utilities for
* [Playwright](https://github.com/microsoft/playwright) - a Node.js library for automating Chromium, Firefox and WebKit browsers.
*
* **Example usage:**
*
* ```javascript
* import { launchPlaywright, playwrightUtils } from 'crawlee';
*
* // Navigate to https://www.example.com in Playwright with a POST request
* const browser = await launchPlaywright();
* const page = await browser.newPage();
* await playwrightUtils.gotoExtended(page, {
* url: 'https://example.com',
* method: 'POST',
* });
* ```
* @module playwrightUtils
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.playwrightUtils = exports.enqueueLinksByClickingElements = void 0;
exports.injectFile = injectFile;
exports.injectJQuery = injectJQuery;
exports.gotoExtended = gotoExtended;
exports.blockRequests = blockRequests;
exports.compileScript = compileScript;
exports.infiniteScroll = infiniteScroll;
exports.saveSnapshot = saveSnapshot;
exports.parseWithCheerio = parseWithCheerio;
exports.closeCookieModals = closeCookieModals;
exports.registerUtilsToContext = registerUtilsToContext;
const tslib_1 = require("tslib");
const promises_1 = require("node:fs/promises");
const node_vm_1 = tslib_1.__importDefault(require("node:vm"));
const browser_1 = require("@crawlee/browser");
const utils_1 = require("@crawlee/utils");
const cheerio = tslib_1.__importStar(require("cheerio"));
const idcac_playwright_1 = require("idcac-playwright");
const ow_1 = tslib_1.__importDefault(require("ow"));
const datastructures_1 = require("@apify/datastructures");
const log_1 = tslib_1.__importDefault(require("@apify/log"));
const click_elements_1 = require("../enqueue-links/click-elements");
Object.defineProperty(exports, "enqueueLinksByClickingElements", { enumerable: true, get: function () { return click_elements_1.enqueueLinksByClickingElements; } });
const rendering_type_prediction_1 = require("./rendering-type-prediction");
// Child logger; every message emitted by this module is prefixed with 'Playwright Utils'.
const log = log_1.default.child({ prefix: 'Playwright Utils' });
// Resolved path of the installed `jquery` package entry file, read and injected by `injectJQuery()`.
const jqueryPath = require.resolve('jquery');
// Upper bound on the number of file contents kept in `injectedFilesCache`.
const MAX_INJECT_FILE_CACHE_SIZE = 10;
// URL patterns blocked by `blockRequests()` when the caller does not supply `urlPatterns`.
const DEFAULT_BLOCK_REQUEST_URL_PATTERNS = ['.css', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip'];
/**
 * Cache of the contents of previously injected files, keyed by file path, to limit file system access.
 * Holds at most `MAX_INJECT_FILE_CACHE_SIZE` entries.
 */
const injectedFilesCache = new datastructures_1.LruCache({ maxLength: MAX_INJECT_FILE_CACHE_SIZE });
/**
 * Injects a JavaScript file into a Playwright page.
 * Unlike Playwright's `addScriptTag` function, this function works on pages
 * with arbitrary Cross-Origin Resource Sharing (CORS) policies.
 *
 * File contents are cached for up to 10 files to limit file system access.
 *
 * @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
 * @param filePath File path
 * @param [options]
 * @param [options.surviveNavigations] When `true`, the file is evaluated again on every
 *   `framenavigated` event of the page; failures of those re-injections are only logged as warnings.
 * @returns Promise of the initial `page.evaluate()` call.
 */
async function injectFile(page, filePath, options = {}) {
    (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage));
    (0, ow_1.default)(filePath, ow_1.default.string);
    (0, ow_1.default)(options, ow_1.default.object.exactShape({
        surviveNavigations: ow_1.default.optional.boolean,
    }));
    let contents = injectedFilesCache.get(filePath);
    // Only null/undefined means "not cached". An empty file legitimately caches as '',
    // which a plain truthiness check would mistake for a miss and re-read from disk every time.
    if (contents == null) {
        contents = await (0, promises_1.readFile)(filePath, 'utf8');
        injectedFilesCache.add(filePath, contents);
    }
    // Start the initial injection before the listener is attached; its promise is what the caller gets.
    const evalP = page.evaluate(contents);
    if (options.surviveNavigations) {
        page.on('framenavigated', async () => page
            .evaluate(contents)
            .catch((error) => log.warning('An error occurred during the script injection!', { error })));
    }
    return evalP;
}
/**
 * Injects the [jQuery](https://jquery.com/) library into a Playwright page, which is handy
 * for scraping tasks such as extracting text from elements matched by CSS selectors.
 *
 * The injected jQuery object is assigned to the `window.$` variable, so it may clash with
 * other libraries on the page that use the same name (e.g. a different jQuery version)
 * and thereby affect the page's own scripts.
 *
 * Unless opted out, the injected jQuery survives page navigations and reloads.
 *
 * **Example usage:**
 * ```javascript
 * await playwrightUtils.injectJQuery(page);
 * const title = await page.evaluate(() => {
 *   return $('head title').text();
 * });
 * ```
 *
 * Calling `injectJQuery()` has no effect on Playwright's own
 * [`page.$()`](https://playwright.dev/docs/api/class-page#page-query-selector) function.
 *
 * @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
 * @param [options.surviveNavigations] Opt-out option to disable the jQuery reinjection after navigation.
 */
async function injectJQuery(page, options) {
    (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage));
    // Reinjection after navigation is on unless the caller explicitly provides a non-nullish value.
    let surviveNavigations = options?.surviveNavigations;
    if (surviveNavigations === undefined || surviveNavigations === null) {
        surviveNavigations = true;
    }
    return injectFile(page, jqueryPath, { surviveNavigations });
}
/**
 * Extended version of Playwright's `page.goto()` allowing to perform requests with HTTP method other than GET,
 * with custom headers and POST payload. URL, method, headers and payload are taken from
 * the `request` parameter, which may be a `Request` instance or a plain object of the same shape.
 *
 * A missing `method` is treated as `GET`, so a request carrying only a `url` results in a plain
 * `page.goto()` call without any request interception.
 *
 * *NOTE:* In recent versions of Playwright using requests other than GET, overriding headers and adding payloads disables
 * browser cache which degrades performance.
 *
 * @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
 * @param request Object with a `url` and optional `method`, `headers` and `payload` (string or `Uint8Array`).
 * @param [gotoOptions] Custom options for `page.goto()`.
 * @returns The result of `page.goto()`.
 */
async function gotoExtended(page, request, gotoOptions = {}) {
    (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage));
    (0, ow_1.default)(request, ow_1.default.object.partialShape({
        url: ow_1.default.string.url,
        method: ow_1.default.optional.string,
        headers: ow_1.default.optional.object,
        payload: ow_1.default.optional.any(ow_1.default.string, ow_1.default.uint8Array),
    }));
    (0, ow_1.default)(gotoOptions, ow_1.default.object);
    const { url, headers, payload } = request;
    // `method` is optional in the validated shape. Without this default, an absent method would
    // compare as "not GET" and needlessly install the (cache-disabling) interceptor below.
    const method = request.method ?? 'GET';
    const isEmpty = (o) => !o || Object.keys(o).length === 0;
    if (method !== 'GET' || payload || !isEmpty(headers)) {
        // This is not deprecated, we use it to log only once.
        log.deprecated('Using other request methods than GET, rewriting headers and adding payloads has a high impact on performance ' +
            'in recent versions of Playwright. Use only when necessary.');
        let wasCalled = false;
        const interceptRequestHandler = async (route) => {
            try {
                // Only the first intercepted request (the navigation itself) gets the overrides.
                // Subsequent requests, e.g. for assets linked from the main HTML, pass through untouched.
                if (wasCalled) {
                    return await route.continue();
                }
                wasCalled = true;
                const overrides = {};
                if (method !== 'GET') {
                    overrides.method = method;
                }
                if (payload) {
                    overrides.postData = payload;
                }
                if (!isEmpty(headers)) {
                    overrides.headers = headers;
                }
                await route.continue(overrides);
            }
            catch (error) {
                // Best effort: interception failures are logged at debug level and do not fail the navigation.
                log.debug('Error inside request interceptor', { error });
            }
            return undefined;
        };
        await page.route('**/*', interceptRequestHandler);
    }
    return page.goto(url, gotoOptions);
}
/**
 * > This is a **Chromium-only feature.**
 * >
 * > On Firefox and WebKit this helper has no effect (a warning is logged instead).
 * > To block requests in those browsers, use `page.route()`.
 *
 * Makes the Playwright browser tab refuse to load URLs matching the provided patterns.
 * Skipping such resources reduces the amount of downloaded data and so speeds up crawling,
 * at the risk of breaking some websites or unexpectedly preventing resources from loading.
 *
 * Unless told otherwise, URLs containing any of the following patterns are blocked:
 *
 * ```json
 * [".css", ".jpg", ".jpeg", ".png", ".svg", ".gif", ".woff", ".pdf", ".zip"]
 * ```
 *
 * Use the `extraUrlPatterns` option to block additional patterns on top of the defaults,
 * or the `urlPatterns` option to replace the defaults with your own list.
 *
 * Blocking is configured through a CDP session (`Network.setBlockedURLs`) rather than through
 * Playwright's request interception, so it does not interfere with the browser cache and avoids
 * the round-trip to Node.js, but it also lacks the extra capabilities interception offers.
 *
 * Main document loads and their redirects are never blocked.
 *
 * **Example usage**
 * ```javascript
 * import { launchPlaywright, playwrightUtils } from 'crawlee';
 *
 * const browser = await launchPlaywright();
 * const page = await browser.newPage();
 *
 * // Block all requests to URLs that include `adsbygoogle.js` and also all defaults.
 * await playwrightUtils.blockRequests(page, {
 *     extraUrlPatterns: ['adsbygoogle.js'],
 * });
 *
 * await page.goto('https://cnn.com');
 * ```
 *
 * @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
 * @param [options]
 * @param [options.urlPatterns] Patterns to block instead of the defaults.
 * @param [options.extraUrlPatterns] Patterns to block in addition to `urlPatterns` (or the defaults).
 */
async function blockRequests(page, options = {}) {
    (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage));
    (0, ow_1.default)(options, ow_1.default.object.exactShape({
        urlPatterns: ow_1.default.optional.array.ofType(ow_1.default.string),
        extraUrlPatterns: ow_1.default.optional.array.ofType(ow_1.default.string),
    }));
    const basePatterns = options.urlPatterns === undefined
        ? DEFAULT_BLOCK_REQUEST_URL_PATTERNS
        : options.urlPatterns;
    const additionalPatterns = options.extraUrlPatterns === undefined ? [] : options.extraUrlPatterns;
    const urls = [...basePatterns, ...additionalPatterns];
    try {
        const cdpSession = await page.context().newCDPSession(page);
        await cdpSession.send('Network.enable');
        await cdpSession.send('Network.setBlockedURLs', { urls });
    }
    catch {
        // Any failure here is reported as a browser incompatibility; the helper never throws.
        log.warning('blockRequests() helper is incompatible with non-Chromium browsers.');
    }
}
/**
 * Turns a Playwright script given as a string into an async function. The resulting function
 * can be invoked at any later time with an object of the following shape:
 * ```
 * {
 *    page: Page,
 *    request: Request,
 * }
 * ```
 * where `page` is a Playwright [`Page`](https://playwright.dev/docs/api/class-page)
 * and `request` is a {@apilink Request}.
 *
 * The `scriptString` parameter becomes the body of the compiled function, so every limitation
 * of function bodies applies to it, and whatever the body returns is what the compiled function returns.
 *
 * As a security measure, the script is compiled in a fresh VM context in which globals such as
 * `process` or `require` are not accessible. This is NOT a safe sandbox though: malicious code
 * may still reach the main process via prototype manipulation, so only run sanitized or trusted code.
 *
 * A custom `context` may be supplied. To keep things as secure as possible, pass only the objects
 * that are really needed, preferably secured copies of them.
 *
 * @param scriptString Source code used as the body of the compiled async function.
 * @param [context] Object used as the global context of the compilation; defaults to a prototype-less object.
 * @returns The compiled async function.
 * @throws The compilation error (after logging it), or an `Error` if the result is not a function.
 */
function compileScript(scriptString, context = Object.create(null)) {
    const wrappedSource = `async ({ page, request }) => {${scriptString}}`;
    let compiled;
    try {
        // The default context has no prototype, which is what "secures" it; a custom context is used as given.
        compiled = node_vm_1.default.runInNewContext(wrappedSource, context);
    }
    catch (compileError) {
        log.exception(compileError, 'Cannot compile script!');
        throw compileError;
    }
    if (typeof compiled === 'function') {
        return compiled;
    }
    // This should not happen...
    throw new Error('Compilation result is not a function!');
}
/**
 * Scrolls to the bottom of a page, or until it times out.
 * Loads dynamic content when it hits the bottom of a page, and then continues scrolling.
 *
 * Scrolling stops when one of the following happens:
 * - no new `xhr`/`fetch`/`websocket`/`other` requests were observed for `waitForSecs` consecutive one-second checks,
 * - `timeoutSecs` (if non-zero) elapsed,
 * - the scrolled distance reached `maxScrollHeight` (if greater than zero),
 * - `stopScrollCallback` resolved to a truthy value.
 *
 * The request listener and the polling interval installed by this function are always released
 * before it returns or throws.
 *
 * @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
 * @param [options]
 * @param [options.timeoutSecs] Overall time limit in seconds; `0` (default) means no limit.
 * @param [options.maxScrollHeight] Scrolled distance limit in pixels; `0` (default) means no limit.
 * @param [options.waitForSecs] Number of consecutive quiet checks before finishing; defaults to `4`.
 * @param [options.scrollDownAndUp] When `true`, scrolls slightly back up after each scroll down.
 * @param [options.buttonSelector] Selector of a button to click after each scroll, if it is visible.
 * @param [options.stopScrollCallback] Async callback; a truthy result stops the scrolling.
 */
async function infiniteScroll(page, options = {}) {
    (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage));
    (0, ow_1.default)(options, ow_1.default.object.exactShape({
        timeoutSecs: ow_1.default.optional.number,
        maxScrollHeight: ow_1.default.optional.number,
        waitForSecs: ow_1.default.optional.number,
        scrollDownAndUp: ow_1.default.optional.boolean,
        buttonSelector: ow_1.default.optional.string,
        stopScrollCallback: ow_1.default.optional.function,
    }));
    const { timeoutSecs = 0, maxScrollHeight = 0, waitForSecs = 4, scrollDownAndUp = false, buttonSelector, stopScrollCallback, } = options;
    let finished = false;
    const startTime = Date.now();
    const CHECK_INTERVAL_MILLIS = 1000;
    const SCROLL_HEIGHT_IF_ZERO = 10000;
    let scrolledDistance = 0;
    const maybeResourceTypesInfiniteScroll = ['xhr', 'fetch', 'websocket', 'other'];
    const resourcesStats = {
        newRequested: 0,
        oldRequested: 0,
        matchNumber: 0,
    };
    // Count requests that may have been triggered by scrolling; kept in a variable so it can be detached later.
    const onRequest = (msg) => {
        if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType())) {
            resourcesStats.newRequested++;
        }
    };
    page.on('request', onRequest);
    const checkFinished = setInterval(() => {
        if (resourcesStats.oldRequested === resourcesStats.newRequested) {
            // No new relevant requests since the last check.
            resourcesStats.matchNumber++;
            if (resourcesStats.matchNumber >= waitForSecs) {
                clearInterval(checkFinished);
                finished = true;
                return;
            }
        }
        else {
            resourcesStats.matchNumber = 0;
            resourcesStats.oldRequested = resourcesStats.newRequested;
        }
        // check if timeout has been reached
        if (timeoutSecs !== 0 && (Date.now() - startTime) / 1000 > timeoutSecs) {
            clearInterval(checkFinished);
            finished = true;
        }
        // check if max scroll height has been reached
        if (maxScrollHeight > 0 && scrolledDistance >= maxScrollHeight) {
            clearInterval(checkFinished);
            finished = true;
        }
    }, CHECK_INTERVAL_MILLIS);
    const doScroll = async () => {
        const bodyScrollHeight = await page.evaluate(() => document.body.scrollHeight);
        // A zero-height body still gets scrolled by a fixed fallback distance.
        const delta = bodyScrollHeight === 0 ? SCROLL_HEIGHT_IF_ZERO : bodyScrollHeight;
        await page.mouse.wheel(0, delta);
        scrolledDistance += delta;
    };
    const maybeClickButton = async () => {
        const button = await page.$(buttonSelector);
        // Box model returns null if the button is not visible
        if (button && (await button.boundingBox())) {
            await button.click({ delay: 10 });
        }
    };
    try {
        while (!finished) {
            await doScroll();
            await page.waitForTimeout(250);
            if (scrollDownAndUp) {
                await page.mouse.wheel(0, -100);
            }
            if (buttonSelector) {
                await maybeClickButton();
            }
            if (stopScrollCallback && (await stopScrollCallback())) {
                break;
            }
        }
    }
    finally {
        // Release resources on every exit path (normal finish, callback stop, or a thrown error),
        // so repeated calls do not pile up listeners and a failure does not leave the timer running.
        clearInterval(checkFinished);
        page.off('request', onRequest);
    }
}
/**
 * Saves a full screenshot and HTML of the current page into a Key-Value store.
 *
 * @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
 * @param [options]
 * @param [options.key] Base key of the stored records; `.jpg` / `.html` is appended. Defaults to `'SNAPSHOT'`.
 * @param [options.screenshotQuality] JPEG quality passed to `page.screenshot()`. Defaults to `50`.
 * @param [options.saveScreenshot] Whether to store the screenshot. Defaults to `true`.
 * @param [options.saveHtml] Whether to store the page HTML. Defaults to `true`.
 * @param [options.keyValueStoreName] Name of the store to open; passed as-is to `KeyValueStore.open()`.
 * @param [options.config] Configuration used to open the store; defaults to the global configuration.
 * @throws {Error} When opening the store, taking the snapshot or saving it fails. The message contains
 *   the message of the underlying error, which is also available as the `cause` property.
 */
async function saveSnapshot(page, options = {}) {
    (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage));
    (0, ow_1.default)(options, ow_1.default.object.exactShape({
        key: ow_1.default.optional.string.nonEmpty,
        screenshotQuality: ow_1.default.optional.number,
        saveScreenshot: ow_1.default.optional.boolean,
        saveHtml: ow_1.default.optional.boolean,
        keyValueStoreName: ow_1.default.optional.string,
        config: ow_1.default.optional.object,
    }));
    const { key = 'SNAPSHOT', screenshotQuality = 50, saveScreenshot = true, saveHtml = true, keyValueStoreName, config, } = options;
    try {
        const store = await browser_1.KeyValueStore.open(keyValueStoreName, {
            config: config ?? browser_1.Configuration.getGlobalConfig(),
        });
        if (saveScreenshot) {
            const screenshotName = `${key}.jpg`;
            const screenshotBuffer = await page.screenshot({
                fullPage: true,
                quality: screenshotQuality,
                type: 'jpeg',
                animations: 'disabled',
            });
            await store.setValue(screenshotName, screenshotBuffer, { contentType: 'image/jpeg' });
        }
        if (saveHtml) {
            const htmlName = `${key}.html`;
            const html = await page.content();
            await store.setValue(htmlName, html, { contentType: 'text/html' });
        }
    }
    catch (err) {
        // Keep the original error (and its stack) reachable through `cause` instead of only copying its message.
        throw new Error(`saveSnapshot with key ${key} failed.\nCause:${err.message}`, { cause: err });
    }
}
/**
 * Loads the page's HTML into Cheerio and returns the handle, so the data can be processed
 * the same way as with {@apilink CheerioCrawler}.
 *
 * Unless `ignoreIframes` is set, every `iframe` element of a page with more than one frame is
 * first replaced in the live page DOM by a `div.crawlee-iframe-replacement` holding the iframe's
 * content. Unless `ignoreShadowRoots` is set, the HTML is obtained with shadow roots expanded;
 * otherwise (or when that yields nothing) `page.content()` is used.
 *
 * **Example usage:**
 * ```javascript
 * const $ = await playwrightUtils.parseWithCheerio(page);
 * const title = $('title').text();
 * ```
 *
 * @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
 * @param ignoreShadowRoots When `true`, shadow roots are not expanded.
 * @param ignoreIframes When `true`, iframes are left untouched.
 */
async function parseWithCheerio(page, ignoreShadowRoots = false, ignoreIframes = false) {
    (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage));
    if (page.frames().length > 1 && !ignoreIframes) {
        // Swaps a single iframe element for a div carrying the iframe's HTML; failures are only logged.
        const inlineIframe = async (iframeHandle) => {
            try {
                const innerFrame = await iframeHandle.contentFrame();
                if (!innerFrame) {
                    return;
                }
                const innerHtml = await innerFrame.content();
                await iframeHandle.evaluate((iframeElement, markup) => {
                    const placeholder = document.createElement('div');
                    placeholder.innerHTML = markup;
                    placeholder.className = 'crawlee-iframe-replacement';
                    iframeElement.replaceWith(placeholder);
                }, innerHtml);
            }
            catch (error) {
                log.warning(`Failed to extract iframe content: ${error}`);
            }
        };
        const iframeHandles = await page.$$('iframe');
        await Promise.all(iframeHandles.map((handle) => inlineIframe(handle)));
    }
    let expandedHtml = null;
    if (!ignoreShadowRoots) {
        expandedHtml = await page.evaluate(`(${utils_1.expandShadowRoots.toString()})(document)`);
    }
    // Fall back to the regular page content when shadow roots were skipped or nothing came back.
    const pageContent = expandedHtml || (await page.content());
    return cheerio.load(pageContent);
}
/**
 * Evaluates the injectable script provided by the `idcac-playwright` package in the given page.
 * Judging by the function name, the script is meant to dismiss cookie consent modals.
 *
 * @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
 */
async function closeCookieModals(page) {
    (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage));
    const injectableScript = (0, idcac_playwright_1.getInjectableScript)();
    await page.evaluate(injectableScript);
}
/**
 * This helper tries to solve the Cloudflare challenge automatically by clicking on the checkbox.
 * It will try to detect the Cloudflare page, click on the checkbox, and wait for 10 seconds (configurable
 * via `sleepSecs` option) for the page to load. Use this in the `postNavigationHooks`; any failure will
 * result in a SessionError which will be automatically retried, so only successful requests will get
 * into the `requestHandler`.
 *
 * Works best with camoufox.
 *
 * The `options` object is never modified, so the same object can safely be reused across calls and pages.
 *
 * **Example usage**
 * ```ts
 * postNavigationHooks: [
 *     async ({ handleCloudflareChallenge }) => {
 *         await handleCloudflareChallenge();
 *     },
 * ],
 * ```
 *
 * @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object
 * @param url current URL for request identification, only used for logging
 * @param [session] current session object; status code 403 is removed from its pool's `blockedStatusCodes`
 * @param [options]
 * @param [options.isBlockedCallback] Async predicate receiving the page; truthy means "blocked by Cloudflare".
 * @param [options.isChallengeCallback] Async predicate receiving the page; truthy means "on the challenge page".
 * @param [options.clickCallback] Custom click handler receiving the page and the `{ x, y }` checkbox position.
 * @param [options.sleepSecs] Seconds to wait after the click attempts. Defaults to `10`.
 * @param [options.verbose] Log progress on `info` level instead of `debug`.
 * @throws {SessionError} When the page is blocked or the challenge is still present after the attempts.
 */
async function handleCloudflareChallenge(page, url, session, options = {}) {
    // eslint-disable-next-line dot-notation
    const blockedStatusCodes = session?.['sessionPool']['blockedStatusCodes'];
    // Cloudflare pages are 403, which are blocked by default
    if (blockedStatusCodes?.includes(403)) {
        const idx = blockedStatusCodes.indexOf(403);
        blockedStatusCodes.splice(idx, 1);
    }
    // Resolve the callbacks into locals instead of writing the defaults back onto `options`:
    // stored defaults would stay bound to this call's page if the caller reused the object.
    const isBlockedCallback = options.isBlockedCallback ?? (async (currentPage) => {
        const isBlocked = await currentPage.evaluate(() => {
            return document.querySelector('h1')?.textContent?.trim().includes('Sorry, you have been blocked');
        });
        return !!isBlocked;
    });
    const isChallengeCallback = options.isChallengeCallback ?? (async (currentPage) => {
        return await currentPage.evaluate(async () => {
            return !!document.querySelector('.footer > .footer-inner > .diagnostic-wrapper > .ray-id');
        });
    });
    const sleepSecs = options.sleepSecs ?? 10;
    // Throws when the page shows the "blocked" message; detection errors count as "not blocked".
    const retryBlocked = async () => {
        const isBlocked = await isBlockedCallback(page).catch(() => false);
        if (isBlocked) {
            throw new browser_1.SessionError(`Blocked by Cloudflare when processing ${url}`);
        }
    };
    // check if we ended up on the CF challenge page
    const isChallenge = async () => {
        return isChallengeCallback(page).catch(() => false);
    };
    if (!(await isChallenge())) {
        await retryBlocked();
        return;
    }
    const logLevel = options.verbose ? 'info' : 'debug';
    log[logLevel](`Detected Cloudflare challenge at ${url}, trying to solve it. This can take up to ${10 + sleepSecs} seconds.`);
    const bb = await page
        .evaluate(() => {
        const div = document.querySelector('.main-content div');
        return div?.getBoundingClientRect();
    })
        .catch(() => undefined);
    if (!bb) {
        // Without the container's position there is nothing to click on.
        return;
    }
    const randomOffset = (range) => {
        return Math.round(100 * range * Math.random()) / 100;
    };
    const x = bb.x + 30;
    const y = bb.y + 25;
    // try to click the checkbox every second
    for (let i = 0; i < 10; i++) {
        await (0, utils_1.sleep)(1000);
        // break early if we are no longer on the CF challenge page
        if (!(await isChallenge())) {
            break;
        }
        if (options.clickCallback) {
            await options.clickCallback(page, { x, y });
            continue;
        }
        // we can click on the text too, so X can be a bit larger
        const xRandomized = x + randomOffset(10);
        const yRandomized = y + randomOffset(10);
        log[logLevel](`Trying to click on the Cloudflare checkbox at ${url}`, { x: xRandomized, y: yRandomized });
        await page.mouse.click(xRandomized, yRandomized);
        // sometimes the checkbox is lower (could be caused by a lag when rendering the logo)
        await page.mouse.click(xRandomized, yRandomized + 35);
    }
    await (0, utils_1.sleep)(sleepSecs * 1000);
    if (await isChallenge()) {
        throw new browser_1.SessionError(`Blocked by Cloudflare when processing ${url}`);
    }
    await retryBlocked();
}
/**
 * Attaches the Playwright helper functions of this module to a crawling context, pre-bound to the
 * context's own `page`, `request`, `session` and `crawler`. The context properties are read lazily,
 * at the time each helper is invoked.
 *
 * @param context Crawling context to extend; it is modified in place.
 * @param crawlerOptions Crawler options; `ignoreShadowRoots` and `ignoreIframes` are forwarded to `parseWithCheerio()`.
 */
function registerUtilsToContext(context, crawlerOptions) {
    context.injectFile = async (filePath, options) => {
        return injectFile(context.page, filePath, options);
    };
    context.injectJQuery = async () => {
        if (context.request.state !== browser_1.RequestState.BEFORE_NAV) {
            // After navigation the library is injected once, without reinjection on later navigations.
            await injectJQuery(context.page, { surviveNavigations: false });
            return;
        }
        log.warning('Using injectJQuery() in preNavigationHooks leads to unstable results. Use it in a postNavigationHook or a requestHandler instead.');
        await injectJQuery(context.page);
    };
    context.blockRequests = async (options) => {
        return blockRequests(context.page, options);
    };
    context.waitForSelector = async (selector, timeoutMs = 5000) => {
        const firstMatch = context.page.locator(selector).first();
        await firstMatch.waitFor({ timeout: timeoutMs, state: 'attached' });
    };
    context.parseWithCheerio = async (selector, timeoutMs = 5000) => {
        if (selector) {
            await context.waitForSelector(selector, timeoutMs);
        }
        const { ignoreShadowRoots, ignoreIframes } = crawlerOptions;
        return parseWithCheerio(context.page, ignoreShadowRoots, ignoreIframes);
    };
    context.infiniteScroll = async (options) => {
        return infiniteScroll(context.page, options);
    };
    context.saveSnapshot = async (options) => {
        // The crawler's configuration always wins over a `config` supplied by the caller.
        const snapshotOptions = { ...options, config: context.crawler.config };
        return saveSnapshot(context.page, snapshotOptions);
    };
    context.enqueueLinksByClickingElements = async (options) => {
        const clickOptions = {
            ...options,
            page: context.page,
            requestQueue: context.crawler.requestQueue,
        };
        return (0, click_elements_1.enqueueLinksByClickingElements)(clickOptions);
    };
    context.compileScript = (scriptString, ctx) => {
        return compileScript(scriptString, ctx);
    };
    context.closeCookieModals = async () => {
        return closeCookieModals(context.page);
    };
    context.handleCloudflareChallenge = async (options) => {
        const { page, request, session } = context;
        return handleCloudflareChallenge(page, request.url, session, options);
    };
}
/**
 * Namespace object bundling the Playwright helpers of this module (plus `enqueueLinksByClickingElements`
 * and `RenderingTypePredictor`, which come from sibling modules) under the single `playwrightUtils` export.
 * @internal
 */
exports.playwrightUtils = {
injectFile,
injectJQuery,
gotoExtended,
blockRequests,
enqueueLinksByClickingElements: click_elements_1.enqueueLinksByClickingElements,
parseWithCheerio,
infiniteScroll,
saveSnapshot,
compileScript,
closeCookieModals,
RenderingTypePredictor: rendering_type_prediction_1.RenderingTypePredictor,
handleCloudflareChallenge,
};
//# sourceMappingURL=playwright-utils.js.map