UNPKG

@crawlee/browser

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

514 lines • 25.7 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.BrowserCrawler = void 0; exports.browserCrawlerEnqueueLinks = browserCrawlerEnqueueLinks; exports.extractUrlsFromPage = extractUrlsFromPage; const tslib_1 = require("tslib"); const basic_1 = require("@crawlee/basic"); const browser_pool_1 = require("@crawlee/browser-pool"); const utils_1 = require("@crawlee/utils"); const ow_1 = tslib_1.__importDefault(require("ow")); const timeout_1 = require("@apify/timeout"); /** * Provides a simple framework for parallel crawling of web pages * using headless browsers with [Puppeteer](https://github.com/puppeteer/puppeteer) * and [Playwright](https://github.com/microsoft/playwright). * The URLs to crawl are fed either from a static list of URLs * or from a dynamic queue of URLs enabling recursive crawling of websites. * * Since `BrowserCrawler` uses headless (or even headful) browsers to download web pages and extract data, * it is useful for crawling of websites that require to execute JavaScript. * If the target website doesn't need JavaScript, we should consider using the {@link CheerioCrawler}, * which downloads the pages using raw HTTP requests and is about 10x faster. * * The source URLs are represented by the {@link Request} objects that are fed from the {@link RequestList} or {@link RequestQueue} instances * provided by the {@link BrowserCrawlerOptions.requestList|`requestList`} or {@link BrowserCrawlerOptions.requestQueue|`requestQueue`} * constructor options, respectively. If neither `requestList` nor `requestQueue` options are provided, * the crawler will open the default request queue either when the {@link BrowserCrawler.addRequests|`crawler.addRequests()`} function is called, * or if `requests` parameter (representing the initial requests) of the {@link BrowserCrawler.run|`crawler.run()`} function is provided. * * If both {@link BrowserCrawlerOptions.requestList|`requestList`} and {@link BrowserCrawlerOptions.requestQueue|`requestQueue`} options are used, * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them * to the {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times. * * The crawler finishes when there are no more {@link Request} objects to crawl. * * `BrowserCrawler` opens a new browser page (i.e. tab or window) for each {@link Request} object to crawl * and then calls the function provided by user as the {@link BrowserCrawlerOptions.requestHandler|`requestHandler`} option. * * New pages are only opened when there is enough free CPU and memory available, * using the functionality provided by the {@link AutoscaledPool} class. * All {@link AutoscaledPool} configuration options can be passed to the {@link BrowserCrawlerOptions.autoscaledPoolOptions|`autoscaledPoolOptions`} * parameter of the `BrowserCrawler` constructor. * For user convenience, the {@link AutoscaledPoolOptions.minConcurrency|`minConcurrency`} and * {@link AutoscaledPoolOptions.maxConcurrency|`maxConcurrency`} options of the * underlying {@link AutoscaledPool} constructor are available directly in the `BrowserCrawler` constructor. * * > *NOTE:* the pool of browser instances is internally managed by the {@link BrowserPool} class. * * @category Crawlers */ class BrowserCrawler extends basic_1.BasicCrawler { /** * All `BrowserCrawler` parameters are passed via an options object. */ constructor(options = {}, config = basic_1.Configuration.getGlobalConfig()) { var _a; (0, ow_1.default)(options, 'BrowserCrawlerOptions', ow_1.default.object.exactShape(BrowserCrawler.optionsShape)); const { navigationTimeoutSecs = 60, requestHandlerTimeoutSecs = 60, persistCookiesPerSession, proxyConfiguration, launchContext = {}, browserPoolOptions, preNavigationHooks = [], postNavigationHooks = [], // Ignored handleRequestFunction, requestHandler: userProvidedRequestHandler, handlePageFunction, failedRequestHandler, handleFailedRequestFunction, headless, ignoreShadowRoots, ignoreIframes, ...basicCrawlerOptions } = options; super({ ...basicCrawlerOptions, requestHandler: async (...args) => this._runRequestHandler(...args), requestHandlerTimeoutSecs: navigationTimeoutSecs + requestHandlerTimeoutSecs + basic_1.BASIC_CRAWLER_TIMEOUT_BUFFER_SECS, }, config); Object.defineProperty(this, "config", { enumerable: true, configurable: true, writable: true, value: config }); /** * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies. * Only available if used by the crawler. */ Object.defineProperty(this, "proxyConfiguration", { enumerable: true, configurable: true, writable: true, value: void 0 }); /** * A reference to the underlying {@link BrowserPool} class that manages the crawler's browsers. */ Object.defineProperty(this, "browserPool", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "launchContext", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "userProvidedRequestHandler", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "navigationTimeoutMillis", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "requestHandlerTimeoutInnerMillis", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "preNavigationHooks", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "postNavigationHooks", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "persistCookiesPerSession", { enumerable: true, configurable: true, writable: true, value: void 0 }); this._handlePropertyNameChange({ newName: 'requestHandler', oldName: 'handlePageFunction', propertyKey: 'userProvidedRequestHandler', newProperty: userProvidedRequestHandler, oldProperty: handlePageFunction, allowUndefined: true, // fallback to the default router }); if (!this.userProvidedRequestHandler) { this.userProvidedRequestHandler = this.router; } this._handlePropertyNameChange({ newName: 'failedRequestHandler', oldName: 'handleFailedRequestFunction', propertyKey: 'failedRequestHandler', newProperty: failedRequestHandler, oldProperty: handleFailedRequestFunction, allowUndefined: true, }); // Cookies should be persisted per session only if session pool is used if (!this.useSessionPool && persistCookiesPerSession) { throw new Error('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.'); } this.launchContext = launchContext; this.navigationTimeoutMillis = navigationTimeoutSecs * 1000; this.requestHandlerTimeoutInnerMillis = requestHandlerTimeoutSecs * 1000; this.proxyConfiguration = proxyConfiguration; this.preNavigationHooks = preNavigationHooks; this.postNavigationHooks = postNavigationHooks; if (headless != null) { (_a = this.launchContext).launchOptions ?? (_a.launchOptions = {}); this.launchContext.launchOptions.headless = headless; } if (this.useSessionPool) { this.persistCookiesPerSession = persistCookiesPerSession !== undefined ? persistCookiesPerSession : true; } else { this.persistCookiesPerSession = false; } if (launchContext?.userAgent) { if (browserPoolOptions.useFingerprints) this.log.info('Custom user agent provided, disabling automatic browser fingerprint injection!'); browserPoolOptions.useFingerprints = false; } const { preLaunchHooks = [], postLaunchHooks = [], ...rest } = browserPoolOptions; this.browserPool = new browser_pool_1.BrowserPool({ ...rest, preLaunchHooks: [this._extendLaunchContext.bind(this), ...preLaunchHooks], postLaunchHooks: [this._maybeAddSessionRetiredListener.bind(this), ...postLaunchHooks], }); } async _cleanupContext(crawlingContext) { const { page } = crawlingContext; // Page creation may be aborted if (page) { await page.close().catch((error) => this.log.debug('Error while closing page', { error })); } } async containsSelectors(page, selectors) { const foundSelectors = (await Promise.all(selectors.map((selector) => page.$(selector)))) .map((x, i) => [x, selectors[i]]) .filter(([x]) => x !== null) .map(([, selector]) => selector); return foundSelectors.length > 0 ? foundSelectors : null; } async isRequestBlocked(crawlingContext) { const { page, response } = crawlingContext; const blockedStatusCodes = // eslint-disable-next-line dot-notation (this.sessionPool?.['blockedStatusCodes'].length ?? 0) > 0 ? // eslint-disable-next-line dot-notation this.sessionPool['blockedStatusCodes'] : basic_1.BLOCKED_STATUS_CODES; // Cloudflare specific heuristic - wait 5 seconds if we get a 403 for the JS challenge to load / resolve. if ((await this.containsSelectors(page, utils_1.CLOUDFLARE_RETRY_CSS_SELECTORS)) && response?.status() === 403) { await (0, utils_1.sleep)(5000); // here we cannot test for response code, because we only have the original response, not the possible Cloudflare redirect on passed challenge. const foundSelectors = await this.containsSelectors(page, utils_1.RETRY_CSS_SELECTORS); if (!foundSelectors) return false; return `Cloudflare challenge failed, found selectors: ${foundSelectors.join(', ')}`; } const foundSelectors = await this.containsSelectors(page, utils_1.RETRY_CSS_SELECTORS); const blockedStatusCode = blockedStatusCodes.find((x) => x === (response?.status() ?? 0)); if (foundSelectors) return `Found selectors: ${foundSelectors.join(', ')}`; if (blockedStatusCode) return `Received blocked status code: ${blockedStatusCode}`; return false; } /** * Wrapper around requestHandler that opens and closes pages etc. */ async _runRequestHandler(crawlingContext) { const newPageOptions = { id: crawlingContext.id, }; const useIncognitoPages = this.launchContext?.useIncognitoPages; const experimentalContainers = this.launchContext?.experimentalContainers; if (this.proxyConfiguration) { const { session } = crawlingContext; const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id, { request: crawlingContext.request, }); crawlingContext.proxyInfo = proxyInfo; newPageOptions.proxyUrl = proxyInfo?.url; newPageOptions.proxyTier = proxyInfo?.proxyTier; if (this.proxyConfiguration.isManInTheMiddle) { /** * @see https://playwright.dev/docs/api/class-browser/#browser-new-context * @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md */ newPageOptions.pageOptions = { ignoreHTTPSErrors: true, acceptInsecureCerts: true, }; } } const page = (await this.browserPool.newPage(newPageOptions)); (0, timeout_1.tryCancel)(); this._enhanceCrawlingContextWithPageInfo(crawlingContext, page, useIncognitoPages || experimentalContainers); // DO NOT MOVE THIS LINE ABOVE! // `enhanceCrawlingContextWithPageInfo` gives us a valid session. // For example, `sessionPoolOptions.sessionOptions.maxUsageCount` can be `1`. // So we must not save the session prior to making sure it was used only once, otherwise we would use it twice. const { request, session } = crawlingContext; if (!request.skipNavigation) { await this._handleNavigation(crawlingContext); (0, timeout_1.tryCancel)(); await this._responseHandler(crawlingContext); (0, timeout_1.tryCancel)(); // save cookies // TODO: Should we save the cookies also after/only the handle page? if (this.persistCookiesPerSession) { const cookies = await crawlingContext.browserController.getCookies(page); (0, timeout_1.tryCancel)(); session?.setCookies(cookies, request.loadedUrl); } } if (!this.requestMatchesEnqueueStrategy(request)) { this.log.debug( // eslint-disable-next-line dot-notation `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`); request.noRetry = true; request.state = basic_1.RequestState.SKIPPED; await this.handleSkippedRequest({ url: request.url, reason: 'redirect' }); return; } if (this.retryOnBlocked) { const error = await this.isRequestBlocked(crawlingContext); if (error) throw new basic_1.SessionError(error); } request.state = basic_1.RequestState.REQUEST_HANDLER; try { await (0, timeout_1.addTimeoutToPromise)(async () => Promise.resolve(this.userProvidedRequestHandler(crawlingContext)), this.requestHandlerTimeoutInnerMillis, `requestHandler timed out after ${this.requestHandlerTimeoutInnerMillis / 1000} seconds.`); request.state = basic_1.RequestState.DONE; } catch (e) { request.state = basic_1.RequestState.ERROR; throw e; } (0, timeout_1.tryCancel)(); } _enhanceCrawlingContextWithPageInfo(crawlingContext, page, createNewSession) { crawlingContext.page = page; // This switch is because the crawlingContexts are created on per request basis. // However, we need to add the proxy info and session from browser, which is created based on the browser-pool configuration. // We would not have to do this switch if the proxy and configuration worked as in CheerioCrawler, // which configures proxy and session for every new request const browserControllerInstance = this.browserPool.getBrowserControllerByPage(page); crawlingContext.browserController = browserControllerInstance; if (!createNewSession) { crawlingContext.session = browserControllerInstance.launchContext.session; } if (!crawlingContext.proxyInfo) { crawlingContext.proxyInfo = browserControllerInstance.launchContext.proxyInfo; } const contextEnqueueLinks = crawlingContext.enqueueLinks; crawlingContext.enqueueLinks = async (enqueueOptions) => { return browserCrawlerEnqueueLinks({ options: { ...enqueueOptions, limit: this.calculateEnqueuedRequestLimit(enqueueOptions?.limit) }, page, requestQueue: await this.getRequestQueue(), robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url), onSkippedRequest: this.handleSkippedRequest, originalRequestUrl: crawlingContext.request.url, finalRequestUrl: crawlingContext.request.loadedUrl, enqueueLinks: contextEnqueueLinks, }); }; } async _handleNavigation(crawlingContext) { const gotoOptions = { timeout: this.navigationTimeoutMillis }; const preNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request); crawlingContext.request.state = basic_1.RequestState.BEFORE_NAV; await this._executeHooks(this.preNavigationHooks, crawlingContext, gotoOptions); (0, timeout_1.tryCancel)(); const postNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request); await this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies); try { crawlingContext.response = (await this._navigationHandler(crawlingContext, gotoOptions)) ?? undefined; } catch (error) { await this._handleNavigationTimeout(crawlingContext, error); crawlingContext.request.state = basic_1.RequestState.ERROR; this._throwIfProxyError(error); throw error; } (0, timeout_1.tryCancel)(); crawlingContext.request.state = basic_1.RequestState.AFTER_NAV; await this._executeHooks(this.postNavigationHooks, crawlingContext, gotoOptions); } async _applyCookies({ session, request, page, browserController }, preHooksCookies, postHooksCookies) { const sessionCookie = session?.getCookies(request.url) ?? []; const parsedPreHooksCookies = preHooksCookies.split(/ *; */).map((c) => (0, basic_1.cookieStringToToughCookie)(c)); const parsedPostHooksCookies = postHooksCookies.split(/ *; */).map((c) => (0, basic_1.cookieStringToToughCookie)(c)); await browserController.setCookies(page, [...sessionCookie, ...parsedPreHooksCookies, ...parsedPostHooksCookies] .filter((c) => typeof c !== 'undefined' && c !== null) .map((c) => ({ ...c, url: c.domain ? undefined : request.url }))); } /** * Marks session bad in case of navigation timeout. */ async _handleNavigationTimeout(crawlingContext, error) { const { session } = crawlingContext; if (error && error.constructor.name === 'TimeoutError') { (0, basic_1.handleRequestTimeout)({ session, errorMessage: error.message }); } await crawlingContext.page.close(); } /** * Transforms proxy-related errors to `SessionError`. */ _throwIfProxyError(error) { if (this.isProxyError(error)) { throw new basic_1.SessionError(this._getMessageFromError(error)); } } /** * Should be overridden in case of different automation library that does not support this response API. */ async _responseHandler(crawlingContext) { const { response, session, request, page } = crawlingContext; if (typeof response === 'object' && typeof response.status === 'function') { const status = response.status(); this.stats.registerStatusCode(status); } if (this.sessionPool && response && session) { if (typeof response === 'object' && typeof response.status === 'function') { this._throwOnBlockedRequest(session, response.status()); } else { this.log.debug('Got a malformed Browser response.', { request, response }); } } request.loadedUrl = await page.url(); } async _extendLaunchContext(_pageId, launchContext) { const launchContextExtends = {}; if (this.sessionPool) { launchContextExtends.session = await this.sessionPool.getSession(); } if (this.proxyConfiguration && !launchContext.proxyUrl) { const proxyInfo = await this.proxyConfiguration.newProxyInfo(launchContextExtends.session?.id, { proxyTier: launchContext.proxyTier ?? undefined, }); launchContext.proxyUrl = proxyInfo?.url; launchContextExtends.proxyInfo = proxyInfo; // Disable SSL verification for MITM proxies if (this.proxyConfiguration.isManInTheMiddle) { /** * @see https://playwright.dev/docs/api/class-browser/#browser-new-context * @see https://github.com/puppeteer/puppeteer/blob/main/docs/api.md */ launchContext.launchOptions.ignoreHTTPSErrors = true; launchContext.launchOptions.acceptInsecureCerts = true; } } launchContext.extend(launchContextExtends); } _maybeAddSessionRetiredListener(_pageId, browserController) { if (this.sessionPool) { const listener = (session) => { const { launchContext } = browserController; if (session.id === launchContext.session.id) { this.browserPool.retireBrowserController(browserController); } }; this.sessionPool.on(basic_1.EVENT_SESSION_RETIRED, listener); browserController.on("browserClosed" /* BROWSER_CONTROLLER_EVENTS.BROWSER_CLOSED */, () => { return this.sessionPool.removeListener(basic_1.EVENT_SESSION_RETIRED, listener); }); } } /** * Function for cleaning up after all requests are processed. * @ignore */ async teardown() { await this.browserPool.destroy(); await super.teardown(); } } exports.BrowserCrawler = BrowserCrawler; Object.defineProperty(BrowserCrawler, "optionsShape", { enumerable: true, configurable: true, writable: true, value: { ...basic_1.BasicCrawler.optionsShape, handlePageFunction: ow_1.default.optional.function, navigationTimeoutSecs: ow_1.default.optional.number.greaterThan(0), preNavigationHooks: ow_1.default.optional.array, postNavigationHooks: ow_1.default.optional.array, launchContext: ow_1.default.optional.object, headless: ow_1.default.optional.any(ow_1.default.boolean, ow_1.default.string), browserPoolOptions: ow_1.default.object, sessionPoolOptions: ow_1.default.optional.object, persistCookiesPerSession: ow_1.default.optional.boolean, useSessionPool: ow_1.default.optional.boolean, proxyConfiguration: ow_1.default.optional.object.validate(basic_1.validators.proxyConfiguration), ignoreShadowRoots: ow_1.default.optional.boolean, ignoreIframes: ow_1.default.optional.boolean, } }); /** @internal */ function containsEnqueueLinks(options) { return !!options.enqueueLinks; } /** @internal */ async function browserCrawlerEnqueueLinks(options) { const { options: enqueueLinksOptions, finalRequestUrl, originalRequestUrl, page } = options; const baseUrl = (0, basic_1.resolveBaseUrlForEnqueueLinksFiltering)({ enqueueStrategy: enqueueLinksOptions?.strategy, finalRequestUrl, originalRequestUrl, userProvidedBaseUrl: enqueueLinksOptions?.baseUrl, }); const urls = await extractUrlsFromPage(page, enqueueLinksOptions?.selector ?? 'a', enqueueLinksOptions?.baseUrl ?? finalRequestUrl ?? originalRequestUrl); if (containsEnqueueLinks(options)) { return options.enqueueLinks({ urls, baseUrl, ...enqueueLinksOptions, }); } return (0, basic_1.enqueueLinks)({ requestQueue: options.requestQueue, robotsTxtFile: options.robotsTxtFile, onSkippedRequest: options.onSkippedRequest, urls, baseUrl, ...enqueueLinksOptions, }); } /** * Extracts URLs from a given page. * @ignore */ async function extractUrlsFromPage( // eslint-disable-next-line @typescript-eslint/no-unsafe-function-type page, selector, baseUrl) { const urls = (await page.$$eval(selector, (linkEls) => linkEls.map((link) => link.getAttribute('href')).filter((href) => !!href))) ?? []; const [base] = await page.$$eval('base', (els) => els.map((el) => el.getAttribute('href'))); const absoluteBaseUrl = base && (0, basic_1.tryAbsoluteURL)(base, baseUrl); if (absoluteBaseUrl) { baseUrl = absoluteBaseUrl; } return urls .map((href) => { // Throw a meaningful error when only a relative URL would be extracted instead of waiting for the Request to fail later. const isHrefAbsolute = /^[a-z][a-z0-9+.-]*:/.test(href); // Grabbed this in 'is-absolute-url' package. if (!isHrefAbsolute && !baseUrl) { throw new Error(`An extracted URL: ${href} is relative and options.baseUrl is not set. ` + 'Use options.baseUrl in enqueueLinks() to automatically resolve relative URLs.'); } return baseUrl ? (0, basic_1.tryAbsoluteURL)(href, baseUrl) : href; }) .filter((href) => !!href); } //# sourceMappingURL=browser-crawler.js.map