@crawlee/puppeteer
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
339 lines • 15.1 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.enqueueLinksByClickingElements = enqueueLinksByClickingElements;
exports.clickElementsAndInterceptNavigationRequests = clickElementsAndInterceptNavigationRequests;
exports.isTargetRelevant = isTargetRelevant;
exports.clickElements = clickElements;
const tslib_1 = require("tslib");
const node_url_1 = require("node:url");
const browser_1 = require("@crawlee/browser");
const ow_1 = tslib_1.__importDefault(require("ow"));
const log_1 = tslib_1.__importDefault(require("@apify/log"));
const puppeteer_request_interception_1 = require("../utils/puppeteer_request_interception");
const STARTING_Z_INDEX = 2147400000;
const log = log_1.default.child({ prefix: 'Puppeteer Click Elements' });
/**
* The function finds elements matching a specific CSS selector in a Puppeteer page,
* clicks all those elements using a mouse move and a left mouse button click and intercepts
* all the navigation requests that are subsequently produced by the page. The intercepted
* requests, including their methods, headers and payloads are then enqueued to a provided
* {@apilink RequestQueue}. This is useful to crawl JavaScript heavy pages where links are not available
* in `href` elements, but rather navigations are triggered in click handlers.
* If you're looking to find URLs in `href` attributes of the page, see {@apilink enqueueLinks}.
*
* Optionally, the function allows you to filter the target links' URLs using an array of {@apilink PseudoUrl} objects
* and override settings of the enqueued {@apilink Request} objects.
*
* **IMPORTANT**: To be able to do this, this function uses various mutations on the page,
* such as changing the Z-index of elements being clicked and their visibility. Therefore,
* it is recommended to only use this function as the last operation in the page.
*
* **USING HEADFUL BROWSER**: When using a headful browser, this function will only be able to click elements
* in the focused tab, effectively limiting concurrency to 1. In headless mode, full concurrency can be achieved.
*
* **PERFORMANCE**: Clicking elements with a mouse and intercepting requests is not a low level operation
* that takes nanoseconds. It's not very CPU intensive, but it takes time. We strongly recommend limiting
* the scope of the clicking as much as possible by using a specific selector that targets only the elements
* that you assume or know will produce a navigation. You can certainly click everything by using
* the `*` selector, but be prepared to wait minutes to get results on a large and complex page.
*
* **Example usage**
*
* ```javascript
* await utils.puppeteer.enqueueLinksByClickingElements({
* page,
* requestQueue,
* selector: 'a.product-detail',
* pseudoUrls: [
* 'https://www.example.com/handbags/[.*]'
* 'https://www.example.com/purses/[.*]'
* ],
* });
* ```
*
* @returns Promise that resolves to {@apilink BatchAddRequestsResult} object.
*/
async function enqueueLinksByClickingElements(options) {
(0, ow_1.default)(options, ow_1.default.object.exactShape({
page: ow_1.default.object.hasKeys('goto', 'evaluate'),
requestQueue: ow_1.default.object.hasKeys('fetchNextRequest', 'addRequest'),
selector: ow_1.default.string,
userData: ow_1.default.optional.object,
clickOptions: ow_1.default.optional.object.hasKeys('clickCount', 'delay'),
pseudoUrls: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.object.hasKeys('purl'))),
globs: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.object.hasKeys('glob'))),
regexps: ow_1.default.optional.array.ofType(ow_1.default.any(ow_1.default.regExp, ow_1.default.object.hasKeys('regexp'))),
transformRequestFunction: ow_1.default.optional.function,
waitForPageIdleSecs: ow_1.default.optional.number,
maxWaitForPageIdleSecs: ow_1.default.optional.number,
label: ow_1.default.optional.string,
forefront: ow_1.default.optional.boolean,
skipNavigation: ow_1.default.optional.boolean,
}));
const { page, requestQueue, selector, clickOptions, pseudoUrls, globs, regexps, transformRequestFunction, waitForPageIdleSecs = 1, maxWaitForPageIdleSecs = 5, forefront, } = options;
const waitForPageIdleMillis = waitForPageIdleSecs * 1000;
const maxWaitForPageIdleMillis = maxWaitForPageIdleSecs * 1000;
const urlPatternObjects = [];
if (pseudoUrls?.length) {
log.deprecated('`pseudoUrls` option is deprecated, use `globs` or `regexps` instead');
urlPatternObjects.push(...(0, browser_1.constructRegExpObjectsFromPseudoUrls)(pseudoUrls));
}
if (globs?.length) {
urlPatternObjects.push(...(0, browser_1.constructGlobObjectsFromGlobs)(globs));
}
if (regexps?.length) {
urlPatternObjects.push(...(0, browser_1.constructRegExpObjectsFromRegExps)(regexps));
}
const interceptedRequests = await clickElementsAndInterceptNavigationRequests({
page,
selector,
waitForPageIdleMillis,
maxWaitForPageIdleMillis,
clickOptions,
});
let requestOptions = (0, browser_1.createRequestOptions)(interceptedRequests, options);
if (transformRequestFunction) {
requestOptions = requestOptions.map(transformRequestFunction).filter((r) => !!r);
}
const requests = (0, browser_1.createRequests)(requestOptions, urlPatternObjects);
const { addedRequests } = await requestQueue.addRequestsBatched(requests, { forefront });
return { processedRequests: addedRequests, unprocessedRequests: [] };
}
/**
* Clicks all elements of given page matching given selector.
* Catches and intercepts all initiated navigation requests and opened pages.
* Returns a list of all target URLs.
* @ignore
*/
async function clickElementsAndInterceptNavigationRequests(options) {
const { page, selector, waitForPageIdleMillis, maxWaitForPageIdleMillis, clickOptions } = options;
const uniqueRequests = new Set();
const browser = page.browser();
const onInterceptedRequest = createInterceptRequestHandler(page, uniqueRequests);
const onTargetCreated = createTargetCreatedHandler(page, uniqueRequests);
const onFrameNavigated = createFrameNavigatedHandler(page, uniqueRequests);
await (0, puppeteer_request_interception_1.addInterceptRequestHandler)(page, onInterceptedRequest);
browser.on('targetcreated', onTargetCreated);
page.on('framenavigated', onFrameNavigated);
await preventHistoryNavigation(page);
await clickElements(page, selector, clickOptions);
await waitForPageIdle({ page, waitForPageIdleMillis, maxWaitForPageIdleMillis });
await restoreHistoryNavigationAndSaveCapturedUrls(page, uniqueRequests);
browser.off('targetcreated', onTargetCreated);
page.off('framenavigated', onFrameNavigated);
await (0, puppeteer_request_interception_1.removeInterceptRequestHandler)(page, onInterceptedRequest);
const serializedRequests = Array.from(uniqueRequests);
return serializedRequests.map((r) => JSON.parse(r));
}
/**
* @ignore
*/
function createInterceptRequestHandler(page, requests) {
return async function onInterceptedRequest(req) {
if (!isTopFrameNavigationRequest(page, req))
return req.continue();
const url = req.url();
requests.add(JSON.stringify({
url,
headers: req.headers(),
method: req.method(),
payload: req.postData(),
}));
if (req.redirectChain().length) {
await req.respond({ body: '' }); // Prevents 301/302 redirect
}
else {
await req.abort('aborted'); // Prevents navigation by js
}
return undefined;
};
}
/**
* @ignore
*/
function isTopFrameNavigationRequest(page, req) {
return req.isNavigationRequest() && req.frame() === page.mainFrame();
}
/**
* @ignore
*/
function createTargetCreatedHandler(page, requests) {
return async function onTargetCreated(target) {
if (!isTargetRelevant(page, target))
return;
const url = target.url();
requests.add(JSON.stringify({ url }));
// We want to close the page but don't care about
// possible errors like target closed.
try {
const createdPage = await target.page();
await createdPage.close();
}
catch (err) {
log.debug('enqueueLinksByClickingElements: Could not close spawned page.', { error: err.stack });
}
};
}
/**
* We're only interested in pages created by the page we're currently clicking in.
* There will generally be a lot of other targets being created in the browser.
*/
function isTargetRelevant(page, target) {
return target.type() === 'page' && page.target() === target.opener();
}
/**
* @ignore
*/
function createFrameNavigatedHandler(page, requests) {
return function onFrameNavigated(frame) {
if (frame !== page.mainFrame())
return;
const url = frame.url();
requests.add(JSON.stringify({ url }));
};
}
/**
* @ignore
*/
async function preventHistoryNavigation(page) {
/* istanbul ignore next */
return page.evaluate(() => {
window.__originalHistory__ = window.history;
delete window.history; // Simple override does not work.
window.history = {
stateHistory: [],
length: 0,
state: {},
go() { },
back() { },
forward() { },
pushState(...args) {
this.stateHistory.push(args);
},
replaceState(...args) {
this.stateHistory.push(args);
},
};
});
}
/**
* Click all elements matching the given selector. To be able to do this using
* Puppeteer's `.click()` we need to make sure the elements are reachable by mouse,
* so we first move them to the top of the page's stacking context and then click.
* We do all in series to prevent elements from hiding one another. Therefore,
* for large element sets, this will take considerable amount of time.
* @ignore
*/
async function clickElements(page, selector, clickOptions) {
const elementHandles = await page.$$(selector);
log.debug(`enqueueLinksByClickingElements: There are ${elementHandles.length} elements to click.`);
let clickedElementsCount = 0;
let zIndex = STARTING_Z_INDEX;
let shouldLogWarning = true;
for (const handle of elementHandles) {
try {
await page.evaluate(updateElementCssToEnableMouseClick, handle, zIndex++);
await handle.click(clickOptions);
clickedElementsCount++;
}
catch (err) {
const e = err;
if (shouldLogWarning && e.stack.includes('is detached from document')) {
log.warning(`An element with selector ${selector} that you're trying to click has been removed from the page. ` +
'This was probably caused by an earlier click which triggered some JavaScript on the page that caused it to change. ' +
'If you\'re trying to enqueue pagination links, we suggest using the "next" button, if available and going one by one.');
shouldLogWarning = false;
}
log.debug('enqueueLinksByClickingElements: Click failed.', { stack: e.stack });
}
}
log.debug(`enqueueLinksByClickingElements: Successfully clicked ${clickedElementsCount} elements out of ${elementHandles.length}`);
}
/* istanbul ignore next */
/**
* This is an in browser function!
*/
function updateElementCssToEnableMouseClick(el, zIndex) {
const casted = el;
casted.style.visibility = 'visible';
casted.style.display = 'block';
casted.style.position = 'fixed';
casted.style.zIndex = String(zIndex);
casted.style.left = '0';
casted.style.top = '0';
const boundingRect = casted.getBoundingClientRect();
if (!boundingRect.height)
casted.style.height = '10px';
if (!boundingRect.width)
casted.style.width = '10px';
}
/**
* This function tracks whether any requests, frame navigations or targets were emitted
* in the past idleIntervalMillis and whenever the interval registers no activity,
* the function returns.
*
* It will also return when a final timeout, represented by the timeoutMillis parameter
* is reached, to prevent blocking on pages with constant network activity.
*
* We need this to make sure we don't finish too soon when intercepting requests triggered
* by clicking in the page. They often get registered by the Node.js process only some
* milliseconds after clicking and we would lose those requests. This is especially prevalent
* when there's only a single element to click.
* @ignore
*/
async function waitForPageIdle({ page, waitForPageIdleMillis, maxWaitForPageIdleMillis, }) {
return new Promise((resolve) => {
let timeout;
let maxTimeout;
const context = page.browserContext();
function newTabTracker(target) {
if (isTargetRelevant(page, target))
activityHandler();
}
function activityHandler() {
clearTimeout(timeout);
timeout = setTimeout(() => {
clearTimeout(maxTimeout);
finish();
}, waitForPageIdleMillis);
}
function maxTimeoutHandler() {
log.debug(`enqueueLinksByClickingElements: Page still showed activity after ${maxWaitForPageIdleMillis}ms. ` +
'This is probably due to the website itself dispatching requests, but some links may also have been missed.');
finish();
}
function finish() {
page.off('request', activityHandler);
page.off('framenavigated', activityHandler);
context.off('targetcreated', newTabTracker);
resolve();
}
maxTimeout = setTimeout(maxTimeoutHandler, maxWaitForPageIdleMillis);
activityHandler(); // We call this once manually in case there would be no requests at all.
page.on('request', activityHandler);
page.on('framenavigated', activityHandler);
context.on('targetcreated', newTabTracker);
});
}
/**
* @ignore
*/
async function restoreHistoryNavigationAndSaveCapturedUrls(page, requests) {
/* istanbul ignore next */
const state = await page.evaluate(() => {
const { stateHistory } = window.history;
window.history = window.__originalHistory__;
return stateHistory;
});
state.forEach((args) => {
try {
const stateUrl = args[args.length - 1];
const url = new node_url_1.URL(stateUrl, page.url()).href;
requests.add(JSON.stringify({ url }));
}
catch (err) {
log.debug('enqueueLinksByClickingElements: Failed to ', { error: err.stack });
}
});
}
//# sourceMappingURL=click-elements.js.map