UNPKG

@crawlee/puppeteer

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

594 lines • 27.2 kB
"use strict"; /** * A namespace that contains various utilities for * [Puppeteer](https://github.com/puppeteer/puppeteer) - the headless Chrome Node API. * * **Example usage:** * * ```javascript * import { launchPuppeteer, utils } from 'crawlee'; * * // Open https://www.example.com in Puppeteer * const browser = await launchPuppeteer(); * const page = await browser.newPage(); * await page.goto('https://www.example.com'); * * // Inject jQuery into a page * await utils.puppeteer.injectJQuery(page); * ``` * @module puppeteerUtils */ Object.defineProperty(exports, "__esModule", { value: true }); exports.puppeteerUtils = exports.removeInterceptRequestHandler = exports.addInterceptRequestHandler = exports.enqueueLinksByClickingElements = exports.blockResources = void 0; exports.injectFile = injectFile; exports.injectJQuery = injectJQuery; exports.parseWithCheerio = parseWithCheerio; exports.blockRequests = blockRequests; exports.sendCDPCommand = sendCDPCommand; exports.cacheResponses = cacheResponses; exports.compileScript = compileScript; exports.gotoExtended = gotoExtended; exports.infiniteScroll = infiniteScroll; exports.saveSnapshot = saveSnapshot; exports.closeCookieModals = closeCookieModals; exports.registerUtilsToContext = registerUtilsToContext; const tslib_1 = require("tslib"); const promises_1 = require("node:fs/promises"); const node_vm_1 = tslib_1.__importDefault(require("node:vm")); const browser_1 = require("@crawlee/browser"); const utils_1 = require("@crawlee/utils"); const cheerio = tslib_1.__importStar(require("cheerio")); const idcac_playwright_1 = require("idcac-playwright"); const ow_1 = tslib_1.__importDefault(require("ow")); const datastructures_1 = require("@apify/datastructures"); const log_1 = tslib_1.__importDefault(require("@apify/log")); const click_elements_1 = require("../enqueue-links/click-elements"); Object.defineProperty(exports, "enqueueLinksByClickingElements", { enumerable: true, get: function () { return click_elements_1.enqueueLinksByClickingElements; } }); const puppeteer_request_interception_1 = require("./puppeteer_request_interception"); Object.defineProperty(exports, "addInterceptRequestHandler", { enumerable: true, get: function () { return puppeteer_request_interception_1.addInterceptRequestHandler; } }); Object.defineProperty(exports, "removeInterceptRequestHandler", { enumerable: true, get: function () { return puppeteer_request_interception_1.removeInterceptRequestHandler; } }); const jqueryPath = require.resolve('jquery'); const MAX_INJECT_FILE_CACHE_SIZE = 10; const DEFAULT_BLOCK_REQUEST_URL_PATTERNS = ['.css', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip']; const log = log_1.default.child({ prefix: 'Puppeteer Utils' }); /** * Cache contents of previously injected files to limit file system access. */ const injectedFilesCache = new datastructures_1.LruCache({ maxLength: MAX_INJECT_FILE_CACHE_SIZE }); /** * Injects a JavaScript file into a Puppeteer page. * Unlike Puppeteer's `addScriptTag` function, this function works on pages * with arbitrary Cross-Origin Resource Sharing (CORS) policies. * * File contents are cached for up to 10 files to limit file system access. * * @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object. * @param filePath File path * @param [options] */ async function injectFile(page, filePath, options = {}) { (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage)); (0, ow_1.default)(filePath, ow_1.default.string); (0, ow_1.default)(options, ow_1.default.object.exactShape({ surviveNavigations: ow_1.default.optional.boolean, })); let contents = injectedFilesCache.get(filePath); if (!contents) { contents = await (0, promises_1.readFile)(filePath, 'utf8'); injectedFilesCache.add(filePath, contents); } const evalP = page.evaluate(contents); if (options.surviveNavigations) { page.on('framenavigated', async () => page .evaluate(contents) .catch((error) => log.warning('An error occurred during the script injection!', { error }))); } return evalP; } /** * Injects the [jQuery](https://jquery.com/) library into a Puppeteer page. * jQuery is often useful for various web scraping and crawling tasks. * For example, it can help extract text from HTML elements using CSS selectors. * * Beware that the injected jQuery object will be set to the `window.$` variable and thus it might cause conflicts with * other libraries included by the page that use the same variable name (e.g. another version of jQuery). * This can affect functionality of page's scripts. * * The injected jQuery will survive page navigations and reloads by default. * * **Example usage:** * ```javascript * await utils.puppeteer.injectJQuery(page); * const title = await page.evaluate(() => { * return $('head title').text(); * }); * ``` * * Note that `injectJQuery()` does not affect the Puppeteer's * [`page.$()`](https://pptr.dev/api/puppeteer.page._/) * function in any way. * * @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object. * @param [options.surviveNavigations] Opt-out option to disable the JQuery reinjection after navigation. */ async function injectJQuery(page, options) { (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage)); return injectFile(page, jqueryPath, { surviveNavigations: options?.surviveNavigations ?? true }); } /** * Returns Cheerio handle for `page.content()`, allowing to work with the data same way as with {@apilink CheerioCrawler}. * * **Example usage:** * ```javascript * const $ = await utils.puppeteer.parseWithCheerio(page); * const title = $('title').text(); * ``` * * @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object. * @param ignoreShadowRoots */ async function parseWithCheerio(page, ignoreShadowRoots = false, ignoreIframes = false) { (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage)); if (page.frames().length > 1 && !ignoreIframes) { const frames = await page.$$('iframe'); await Promise.all(frames.map(async (frame) => { try { const iframe = await frame.contentFrame(); if (iframe) { const contents = await iframe.content(); await frame.evaluate((f, c) => { const replacementNode = document.createElement('div'); replacementNode.innerHTML = c; replacementNode.className = 'crawlee-iframe-replacement'; f.replaceWith(replacementNode); }, contents); } } catch (error) { log.warning(`Failed to extract iframe content: ${error}`); } })); } const html = ignoreShadowRoots ? null : (await page.evaluate(`(${utils_1.expandShadowRoots.toString()})(document)`)); const pageContent = html || (await page.content()); return cheerio.load(pageContent); } /** * Forces the Puppeteer browser tab to block loading URLs that match a provided pattern. * This is useful to speed up crawling of websites, since it reduces the amount * of data that needs to be downloaded from the web, but it may break some websites * or unexpectedly prevent loading of resources. * * By default, the function will block all URLs including the following patterns: * * ```json * [".css", ".jpg", ".jpeg", ".png", ".svg", ".gif", ".woff", ".pdf", ".zip"] * ``` * * If you want to extend this list further, use the `extraUrlPatterns` option, * which will keep blocking the default patterns, as well as add your custom ones. * If you would like to block only specific patterns, use the `urlPatterns` option, * which will override the defaults and block only URLs with your custom patterns. * * This function does not use Puppeteer's request interception and therefore does not interfere * with browser cache. It's also faster than blocking requests using interception, * because the blocking happens directly in the browser without the round-trip to Node.js, * but it does not provide the extra benefits of request interception. * * The function will never block main document loads and their respective redirects. * * **Example usage** * ```javascript * import { launchPuppeteer, utils } from 'crawlee'; * * const browser = await launchPuppeteer(); * const page = await browser.newPage(); * * // Block all requests to URLs that include `adsbygoogle.js` and also all defaults. * await utils.puppeteer.blockRequests(page, { * extraUrlPatterns: ['adsbygoogle.js'], * }); * * await page.goto('https://cnn.com'); * ``` * * @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object. * @param [options] */ async function blockRequests(page, options = {}) { (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage)); (0, ow_1.default)(options, ow_1.default.object.exactShape({ urlPatterns: ow_1.default.optional.array.ofType(ow_1.default.string), extraUrlPatterns: ow_1.default.optional.array.ofType(ow_1.default.string), })); const { urlPatterns = DEFAULT_BLOCK_REQUEST_URL_PATTERNS, extraUrlPatterns = [] } = options; const patternsToBlock = [...urlPatterns, ...extraUrlPatterns]; // We use CDP commands instead of request interception as the latter disables caching, which is not ideal await sendCDPCommand(page, 'Network.setBlockedURLs', { urls: patternsToBlock }); } /** * @internal */ async function sendCDPCommand(page, command, ...args) { // In puppeteer 16.x and 17.x, the `_client` method is completely omitted from the types. It's still there and works the same way, but it is hidden. // Puppeteer <= 17 if (Reflect.has(page, '_client')) { const client = Reflect.get(page, '_client'); if (typeof client === 'function') { return client.call(page).send(command, ...args); } return client.send(command, ...args); } const jsonPath = require.resolve('puppeteer/package.json'); const parsed = JSON.parse(await (0, promises_1.readFile)(jsonPath, 'utf-8')); throw new Error(`Cannot detect CDP client for Puppeteer ${parsed.version}. You should report this to Crawlee, mentioning the puppeteer version you are using.`); } /** * `blockResources()` has a high impact on performance in recent versions of Puppeteer. * Until this resolves, please use `utils.puppeteer.blockRequests()`. * @deprecated */ const blockResources = async (page, resourceTypes = ['stylesheet', 'font', 'image', 'media']) => { log.deprecated('utils.puppeteer.blockResources() has a high impact on performance in recent versions of Puppeteer. ' + 'Until this resolves, please use utils.puppeteer.blockRequests()'); await (0, puppeteer_request_interception_1.addInterceptRequestHandler)(page, async (request) => { const type = request.resourceType(); if (resourceTypes.includes(type)) await request.abort(); else await request.continue(); }); }; exports.blockResources = blockResources; /** * *NOTE:* In recent versions of Puppeteer using this function entirely disables browser cache which resolves in sub-optimal * performance. Until this resolves, we suggest just relying on the in-browser cache unless absolutely necessary. * * Enables caching of intercepted responses into a provided object. Automatically enables request interception in Puppeteer. * *IMPORTANT*: Caching responses stores them to memory, so too loose rules could cause memory leaks for longer running crawlers. * This issue should be resolved or atleast mitigated in future iterations of this feature. * @param page * Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object. * @param cache * Object in which responses are stored * @param responseUrlRules * List of rules that are used to check if the response should be cached. * String rules are compared as page.url().includes(rule) while RegExp rules are evaluated as rule.test(page.url()). * @deprecated */ async function cacheResponses(page, cache, responseUrlRules) { (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage)); (0, ow_1.default)(cache, ow_1.default.object); (0, ow_1.default)(responseUrlRules, ow_1.default.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.regExp))); log.deprecated('utils.puppeteer.cacheResponses() has a high impact on performance ' + "in recent versions of Puppeteer so it's use is discouraged until this issue resolves."); await (0, puppeteer_request_interception_1.addInterceptRequestHandler)(page, async (request) => { const url = request.url(); if (cache[url]) { await request.respond(cache[url]); return; } await request.continue(); }); page.on('response', async (response) => { const url = response.url(); // Response is already cached, do nothing if (cache[url]) return; const shouldCache = responseUrlRules.some((rule) => { if (typeof rule === 'string') return url.includes(rule); if (rule instanceof RegExp) return rule.test(url); return false; }); try { if (shouldCache) { const buffer = await response.buffer(); cache[url] = { status: response.status(), headers: response.headers(), body: buffer, }; } } catch { // ignore errors, usually means that buffer is empty or broken connection } }); } /** * Compiles a Puppeteer script into an async function that may be executed at any time * by providing it with the following object: * ``` * { * page: Page, * request: Request, * } * ``` * Where `page` is a Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) * and `request` is a {@apilink Request}. * * The function is compiled by using the `scriptString` parameter as the function's body, * so any limitations to function bodies apply. Return value of the compiled function * is the return value of the function body = the `scriptString` parameter. * * As a security measure, no globals such as `process` or `require` are accessible * from within the function body. Note that the function does not provide a safe * sandbox and even though globals are not easily accessible, malicious code may * still execute in the main process via prototype manipulation. Therefore you * should only use this function to execute sanitized or safe code. * * Custom context may also be provided using the `context` parameter. To improve security, * make sure to only pass the really necessary objects to the context. Preferably making * secured copies beforehand. */ function compileScript(scriptString, context = Object.create(null)) { const funcString = `async ({ page, request }) => {${scriptString}}`; let func; try { func = node_vm_1.default.runInNewContext(funcString, context); // "Secure" the context by removing prototypes, unless custom context is provided. } catch (err) { log.exception(err, 'Cannot compile script!'); throw err; } if (typeof func !== 'function') throw new Error('Compilation result is not a function!'); // This should not happen... return func; } /** * Extended version of Puppeteer's `page.goto()` allowing to perform requests with HTTP method other than GET, * with custom headers and POST payload. URL, method, headers and payload are taken from * request parameter that must be an instance of Request class. * * *NOTE:* In recent versions of Puppeteer using requests other than GET, overriding headers and adding payloads disables * browser cache which degrades performance. * * @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object. * @param request * @param [gotoOptions] Custom options for `page.goto()`. */ async function gotoExtended(page, request, gotoOptions = {}) { (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage)); (0, ow_1.default)(request, ow_1.default.object.partialShape({ url: ow_1.default.string.url, method: ow_1.default.optional.string, headers: ow_1.default.optional.object, payload: ow_1.default.optional.any(ow_1.default.string, ow_1.default.uint8Array), })); (0, ow_1.default)(gotoOptions, ow_1.default.object); gotoOptions = { ...gotoOptions }; if (gotoOptions.waitUntil === 'networkidle') { gotoOptions.waitUntil = 'networkidle0'; } const { url, method, headers, payload } = request; const isEmpty = (o) => !o || Object.keys(o).length === 0; if (method !== 'GET' || payload || !isEmpty(headers)) { // This is not deprecated, we use it to log only once. log.deprecated('Using other request methods than GET, rewriting headers and adding payloads has a high impact on performance ' + 'in recent versions of Puppeteer. Use only when necessary.'); let wasCalled = false; const interceptRequestHandler = async (interceptedRequest) => { // We want to ensure that this won't get executed again in a case that there is a subsequent request // for example for some asset file link from main HTML. if (wasCalled) { return interceptedRequest.continue(); } wasCalled = true; const overrides = {}; if (method !== 'GET') overrides.method = method; if (payload) overrides.postData = payload; if (!isEmpty(headers)) overrides.headers = headers; await (0, puppeteer_request_interception_1.removeInterceptRequestHandler)(page, interceptRequestHandler); await interceptedRequest.continue(overrides); return undefined; }; await (0, puppeteer_request_interception_1.addInterceptRequestHandler)(page, interceptRequestHandler); } return page.goto(url, gotoOptions); } /** * Scrolls to the bottom of a page, or until it times out. * Loads dynamic content when it hits the bottom of a page, and then continues scrolling. * @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object. * @param [options] */ async function infiniteScroll(page, options = {}) { (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage)); (0, ow_1.default)(options, ow_1.default.object.exactShape({ timeoutSecs: ow_1.default.optional.number, maxScrollHeight: ow_1.default.optional.number, waitForSecs: ow_1.default.optional.number, scrollDownAndUp: ow_1.default.optional.boolean, buttonSelector: ow_1.default.optional.string, stopScrollCallback: ow_1.default.optional.function, })); const { timeoutSecs = 0, maxScrollHeight = 0, waitForSecs = 4, scrollDownAndUp = false, buttonSelector, stopScrollCallback, } = options; let finished; const startTime = Date.now(); const CHECK_INTERVAL_MILLIS = 1000; const SCROLL_HEIGHT_IF_ZERO = 10000; let scrolledDistance = 0; const maybeResourceTypesInfiniteScroll = ['xhr', 'fetch', 'websocket', 'other']; const resourcesStats = { newRequested: 0, oldRequested: 0, matchNumber: 0, }; page.on('request', (msg) => { if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType())) { resourcesStats.newRequested++; } }); // Move mouse to the center of the page, so we can scroll up-down let body = await page.$('body'); let retry = 0; while (!body && retry < 10) { await (0, utils_1.sleep)(100); body = await page.$('body'); retry++; } if (!body) { return; } const boundingBox = await body.boundingBox(); await page.mouse.move(boundingBox.x + boundingBox.width / 2, // x boundingBox.y + boundingBox.height / 2); const checkFinished = setInterval(() => { if (resourcesStats.oldRequested === resourcesStats.newRequested) { resourcesStats.matchNumber++; if (resourcesStats.matchNumber >= waitForSecs) { clearInterval(checkFinished); finished = true; return; } } else { resourcesStats.matchNumber = 0; resourcesStats.oldRequested = resourcesStats.newRequested; } // check if timeout has been reached if (timeoutSecs !== 0 && (Date.now() - startTime) / 1000 > timeoutSecs) { clearInterval(checkFinished); finished = true; } // check if max scroll height has been reached if (maxScrollHeight > 0 && scrolledDistance > maxScrollHeight) { clearInterval(checkFinished); finished = true; } }, CHECK_INTERVAL_MILLIS); const doScroll = async () => { /* istanbul ignore next */ const bodyScrollHeight = await page.evaluate(() => document.body.scrollHeight); const delta = bodyScrollHeight === 0 ? SCROLL_HEIGHT_IF_ZERO : bodyScrollHeight; await page.mouse.wheel({ deltaY: delta }); scrolledDistance += delta; }; const maybeClickButton = async () => { const button = await page.$(buttonSelector); // Box model returns null if the button is not visible if (button && (await button.boxModel())) { await button.click({ delay: 10 }); } }; while (!finished) { await doScroll(); await (0, utils_1.sleep)(250); if (scrollDownAndUp) { await page.mouse.wheel({ deltaY: -1000 }); } if (buttonSelector) { await maybeClickButton(); } if (stopScrollCallback) { if (await stopScrollCallback()) { clearInterval(checkFinished); break; } } } } /** * Saves a full screenshot and HTML of the current page into a Key-Value store. * @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object. * @param [options] */ async function saveSnapshot(page, options = {}) { (0, ow_1.default)(page, ow_1.default.object.validate(browser_1.validators.browserPage)); (0, ow_1.default)(options, ow_1.default.object.exactShape({ key: ow_1.default.optional.string.nonEmpty, screenshotQuality: ow_1.default.optional.number, saveScreenshot: ow_1.default.optional.boolean, saveHtml: ow_1.default.optional.boolean, keyValueStoreName: ow_1.default.optional.string, config: ow_1.default.optional.object, })); const { key = 'SNAPSHOT', screenshotQuality = 50, saveScreenshot = true, saveHtml = true, keyValueStoreName, config, } = options; try { const store = await browser_1.KeyValueStore.open(keyValueStoreName, { config: config ?? browser_1.Configuration.getGlobalConfig(), }); if (saveScreenshot) { const screenshotName = `${key}.jpg`; const screenshotBuffer = await page.screenshot({ fullPage: true, quality: screenshotQuality, type: 'jpeg', }); await store.setValue(screenshotName, screenshotBuffer, { contentType: 'image/jpeg' }); } if (saveHtml) { const htmlName = `${key}.html`; const html = await page.content(); await store.setValue(htmlName, html, { contentType: 'text/html' }); } } catch (err) { throw new Error(`saveSnapshot with key ${key} failed.\nCause:${err.message}`); } } async function closeCookieModals(page) { await page.evaluate((0, idcac_playwright_1.getInjectableScript)()); } /** @internal */ function registerUtilsToContext(context, crawlerOptions) { context.injectFile = async (filePath, options) => injectFile(context.page, filePath, options); context.injectJQuery = async () => { if (context.request.state === browser_1.RequestState.BEFORE_NAV) { log.warning('Using injectJQuery() in preNavigationHooks leads to unstable results. Use it in a postNavigationHook or a requestHandler instead.'); await injectJQuery(context.page); return; } await injectJQuery(context.page, { surviveNavigations: false }); }; context.waitForSelector = async (selector, timeoutMs = 5000) => { await context.page.waitForSelector(selector, { timeout: timeoutMs }); }; context.parseWithCheerio = async (selector, timeoutMs = 5000) => { if (selector) { await context.waitForSelector(selector, timeoutMs); } return parseWithCheerio(context.page, crawlerOptions.ignoreShadowRoots, crawlerOptions.ignoreIframes); }; context.enqueueLinksByClickingElements = async (options) => (0, click_elements_1.enqueueLinksByClickingElements)({ page: context.page, requestQueue: context.crawler.requestQueue, ...options, }); context.blockRequests = async (options) => blockRequests(context.page, options); context.blockResources = async (resourceTypes) => (0, exports.blockResources)(context.page, resourceTypes); context.cacheResponses = async (cache, responseUrlRules) => { return cacheResponses(context.page, cache, responseUrlRules); }; context.compileScript = (scriptString, ctx) => compileScript(scriptString, ctx); context.addInterceptRequestHandler = async (handler) => (0, puppeteer_request_interception_1.addInterceptRequestHandler)(context.page, handler); context.removeInterceptRequestHandler = async (handler) => (0, puppeteer_request_interception_1.removeInterceptRequestHandler)(context.page, handler); context.infiniteScroll = async (options) => infiniteScroll(context.page, options); context.saveSnapshot = async (options) => saveSnapshot(context.page, { ...options, config: context.crawler.config }); context.closeCookieModals = async () => closeCookieModals(context.page); } /** @internal */ exports.puppeteerUtils = { injectFile, injectJQuery, enqueueLinksByClickingElements: click_elements_1.enqueueLinksByClickingElements, blockRequests, blockResources: exports.blockResources, cacheResponses, compileScript, gotoExtended, addInterceptRequestHandler: puppeteer_request_interception_1.addInterceptRequestHandler, removeInterceptRequestHandler: puppeteer_request_interception_1.removeInterceptRequestHandler, infiniteScroll, saveSnapshot, parseWithCheerio, closeCookieModals, }; //# sourceMappingURL=puppeteer_utils.js.map