UNPKG

@crawlee/playwright

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

472 lines • 23.7 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.AdaptivePlaywrightCrawler = void 0; exports.createAdaptivePlaywrightRouter = createAdaptivePlaywrightRouter; const tslib_1 = require("tslib"); const browser_1 = require("@crawlee/browser"); const core_1 = require("@crawlee/core"); const utils_1 = require("@crawlee/utils"); const cheerio_1 = require("cheerio"); const lodash_isequal_1 = tslib_1.__importDefault(require("lodash.isequal")); const timeout_1 = require("@apify/timeout"); const playwright_crawler_1 = require("./playwright-crawler"); const rendering_type_prediction_1 = require("./utils/rendering-type-prediction"); class AdaptivePlaywrightCrawlerStatistics extends core_1.Statistics { constructor(options = {}) { super(options); Object.defineProperty(this, "state", { enumerable: true, configurable: true, writable: true, value: null }); // this needs to be assigned for a valid override, but the initialization is done by a reset() call from the parent constructor this.reset(); } reset() { super.reset(); this.state.httpOnlyRequestHandlerRuns = 0; this.state.browserRequestHandlerRuns = 0; this.state.renderingTypeMispredictions = 0; } async _maybeLoadStatistics() { await super._maybeLoadStatistics(); const savedState = await this.keyValueStore?.getValue(this.persistStateKey); if (!savedState) { return; } this.state.httpOnlyRequestHandlerRuns = savedState.httpOnlyRequestHandlerRuns; this.state.browserRequestHandlerRuns = savedState.browserRequestHandlerRuns; this.state.renderingTypeMispredictions = savedState.renderingTypeMispredictions; } trackHttpOnlyRequestHandlerRun() { var _a; (_a = this.state).httpOnlyRequestHandlerRuns ?? (_a.httpOnlyRequestHandlerRuns = 0); this.state.httpOnlyRequestHandlerRuns += 1; } trackBrowserRequestHandlerRun() { var _a; (_a = this.state).browserRequestHandlerRuns ?? (_a.browserRequestHandlerRuns = 0); this.state.browserRequestHandlerRuns += 1; } trackRenderingTypeMisprediction() { var _a; (_a = this.state).renderingTypeMispredictions ?? (_a.renderingTypeMispredictions = 0); this.state.renderingTypeMispredictions += 1; } } const proxyLogMethods = [ 'error', 'exception', 'softFail', 'info', 'debug', 'perf', 'warningOnce', 'deprecated', ]; /** * An extension of {@link PlaywrightCrawler} that uses a more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible. * * **Example usage:** * * ```javascript * const crawler = new AdaptivePlaywrightCrawler({ * renderingTypeDetectionRatio: 0.1, * async requestHandler({ querySelector, pushData, enqueueLinks, request, log }) { * // This function is called to extract data from a single web page * const $prices = await querySelector('span.price') * * await pushData({ * url: request.url, * price: $prices.filter(':contains("$")').first().text(), * }) * * await enqueueLinks({ selector: '.pagination a' }) * }, * }); * * await crawler.run([ * 'http://www.example.com/page-1', * 'http://www.example.com/page-2', * ]); * ``` * * @experimental */ class AdaptivePlaywrightCrawler extends playwright_crawler_1.PlaywrightCrawler { constructor(options = {}, config = core_1.Configuration.getGlobalConfig()) { const { requestHandler, renderingTypeDetectionRatio = 0.1, renderingTypePredictor, resultChecker, resultComparator, statisticsOptions, preventDirectStorageAccess = true, ...rest } = options; super(rest, config); Object.defineProperty(this, "config", { enumerable: true, configurable: true, writable: true, value: config }); Object.defineProperty(this, "adaptiveRequestHandler", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "renderingTypePredictor", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "resultChecker", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "resultComparator", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "preventDirectStorageAccess", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "inFlightRenderingTypeDetections", { enumerable: true, configurable: true, writable: true, value: 0 }); /** * Default {@link Router} instance that will be used if we don't specify any {@link AdaptivePlaywrightCrawlerOptions.requestHandler|`requestHandler`}. * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}. */ // @ts-ignore Object.defineProperty(this, "router", { enumerable: true, configurable: true, writable: true, value: core_1.Router.create() }); this.adaptiveRequestHandler = requestHandler ?? this.router; this.renderingTypePredictor = renderingTypePredictor ?? new rendering_type_prediction_1.RenderingTypePredictor({ detectionRatio: renderingTypeDetectionRatio }); this.resultChecker = resultChecker ?? (() => true); if (resultComparator !== undefined) { this.resultComparator = resultComparator; } else if (resultChecker !== undefined) { this.resultComparator = (resultA, resultB) => this.resultChecker(resultA) && this.resultChecker(resultB); } else { this.resultComparator = (resultA, resultB) => { return (resultA.datasetItems.length === resultB.datasetItems.length && resultA.datasetItems.every((itemA, i) => { const itemB = resultB.datasetItems[i]; return (0, lodash_isequal_1.default)(itemA, itemB); })); }; } this.stats = new AdaptivePlaywrightCrawlerStatistics({ logMessage: `${this.log.getOptions().prefix} request statistics:`, config, ...statisticsOptions, }); this.preventDirectStorageAccess = preventDirectStorageAccess; } /** * Returns the number of rendering type detections currently in progress. */ get inFlightRenderingTypeDetectionCount() { return this.inFlightRenderingTypeDetections; } async _init() { await this.renderingTypePredictor.initialize(); return await super._init(); } async _runRequestHandler(crawlingContext) { const renderingTypePrediction = this.renderingTypePredictor.predict(crawlingContext.request); const shouldDetectRenderingType = Math.random() < renderingTypePrediction.detectionProbabilityRecommendation; if (shouldDetectRenderingType) { this.inFlightRenderingTypeDetections++; } try { if (!shouldDetectRenderingType) { crawlingContext.log.debug(`Predicted rendering type ${renderingTypePrediction.renderingType} for ${crawlingContext.request.url}`); } if (renderingTypePrediction.renderingType === 'static' && !shouldDetectRenderingType) { crawlingContext.log.debug(`Running HTTP-only request handler for ${crawlingContext.request.url}`); this.stats.trackHttpOnlyRequestHandlerRun(); const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext); if (plainHTTPRun.ok && this.resultChecker(plainHTTPRun.result)) { crawlingContext.log.debug(`HTTP-only request handler succeeded for ${crawlingContext.request.url}`); plainHTTPRun.logs?.forEach(([log, method, ...args]) => log[method](...args)); await this.commitResult(crawlingContext, plainHTTPRun.result); return; } if (!plainHTTPRun.ok) { crawlingContext.log.exception(plainHTTPRun.error, `HTTP-only request handler failed for ${crawlingContext.request.url}`); } else { crawlingContext.log.warning(`HTTP-only request handler returned a suspicious result for ${crawlingContext.request.url}`); this.stats.trackRenderingTypeMisprediction(); } } crawlingContext.log.debug(`Running browser request handler for ${crawlingContext.request.url}`); this.stats.trackBrowserRequestHandlerRun(); // Run the request handler in a browser. The copy of the crawler state is kept so that we can perform // a rendering type detection if necessary. Without this measure, the HTTP request handler would run // under different conditions, which could change its behavior. Changes done to the crawler state by // the HTTP request handler will not be committed to the actual storage. const { result: browserRun, initialStateCopy } = await this.runRequestHandlerInBrowser(crawlingContext); if (!browserRun.ok) { throw browserRun.error; } await this.commitResult(crawlingContext, browserRun.result); if (shouldDetectRenderingType) { crawlingContext.log.debug(`Detecting rendering type for ${crawlingContext.request.url}`); const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext, initialStateCopy); const detectionResult = (() => { if (!plainHTTPRun.ok) { return 'clientOnly'; } const comparisonResult = this.resultComparator(plainHTTPRun.result, browserRun.result); if (comparisonResult === true || comparisonResult === 'equal') { return 'static'; } if (comparisonResult === false || comparisonResult === 'different') { return 'clientOnly'; } return undefined; })(); crawlingContext.log.debug(`Detected rendering type ${detectionResult} for ${crawlingContext.request.url}`); if (detectionResult !== undefined) { this.renderingTypePredictor.storeResult(crawlingContext.request, detectionResult); } } } finally { if (shouldDetectRenderingType) { this.inFlightRenderingTypeDetections--; } } } async commitResult(crawlingContext, { calls, keyValueStoreChanges }) { await Promise.all([ ...calls.pushData.map(async (params) => crawlingContext.pushData(...params)), ...calls.addRequests.map(async (params) => crawlingContext.addRequests(...params)), ...Object.entries(keyValueStoreChanges).map(async ([storeIdOrName, changes]) => { const store = await crawlingContext.getKeyValueStore(storeIdOrName); await Promise.all(Object.entries(changes).map(async ([key, { changedValue, options }]) => store.setValue(key, changedValue, options))); }), ]); } allowStorageAccess(func) { return async (...args) => (0, core_1.withCheckedStorageAccess)(() => { }, async () => func(...args)); } async runRequestHandlerInBrowser(crawlingContext) { const result = new core_1.RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY); let initialStateCopy; try { await super._runRequestHandler.call(new Proxy(this, { get: (target, propertyName, receiver) => { if (propertyName === 'userProvidedRequestHandler') { return async (playwrightContext) => (0, core_1.withCheckedStorageAccess)(() => { if (this.preventDirectStorageAccess) { throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler'); } }, () => this.adaptiveRequestHandler({ id: crawlingContext.id, session: crawlingContext.session, proxyInfo: crawlingContext.proxyInfo, request: crawlingContext.request, response: { url: crawlingContext.response.url(), statusCode: crawlingContext.response.status(), headers: crawlingContext.response.headers(), trailers: {}, complete: true, redirectUrls: [], }, log: crawlingContext.log, page: crawlingContext.page, querySelector: async (selector, timeoutMs = 5000) => { const locator = playwrightContext.page.locator(selector).first(); await locator.waitFor({ timeout: timeoutMs, state: 'attached' }); const $ = await playwrightContext.parseWithCheerio(); return $(selector); }, async waitForSelector(selector, timeoutMs = 5000) { const locator = playwrightContext.page.locator(selector).first(); await locator.waitFor({ timeout: timeoutMs, state: 'attached' }); }, async parseWithCheerio(selector, timeoutMs = 5000) { if (selector) { const locator = playwrightContext.page.locator(selector).first(); await locator.waitFor({ timeout: timeoutMs, state: 'attached' }); } return playwrightContext.parseWithCheerio(); }, enqueueLinks: async (options = {}, timeoutMs = 5000) => { let urls; if (options.urls === undefined) { const selector = options.selector ?? 'a'; const locator = playwrightContext.page.locator(selector).first(); await locator.waitFor({ timeout: timeoutMs, state: 'attached' }); urls = await (0, browser_1.extractUrlsFromPage)(playwrightContext.page, selector, options.baseUrl ?? playwrightContext.request.loadedUrl ?? playwrightContext.request.url); } else { urls = options.urls; } return await this.enqueueLinks({ ...options, urls }, crawlingContext.request, result); }, addRequests: result.addRequests, pushData: result.pushData, useState: this.allowStorageAccess(async (defaultValue) => { const state = await result.useState(defaultValue); if (initialStateCopy === undefined) { initialStateCopy = JSON.parse(JSON.stringify(state)); } return state; }), getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore), })); } return Reflect.get(target, propertyName, receiver); }, }), crawlingContext); return { result: { result, ok: true }, initialStateCopy }; } catch (error) { return { result: { error, ok: false }, initialStateCopy }; } } async runRequestHandlerWithPlainHTTP(crawlingContext, oldStateCopy) { const result = new core_1.RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY); const logs = []; const pageGotoOptions = { timeout: this.navigationTimeoutMillis }; // Irrelevant, but required by BrowserCrawler try { await (0, core_1.withCheckedStorageAccess)(() => { if (this.preventDirectStorageAccess) { throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler'); } }, async () => (0, timeout_1.addTimeoutToPromise)(async () => { const hookContext = { id: crawlingContext.id, session: crawlingContext.session, proxyInfo: crawlingContext.proxyInfo, request: crawlingContext.request, log: this.createLogProxy(crawlingContext.log, logs), }; await this._executeHooks(this.preNavigationHooks, { ...hookContext, get page() { throw new Error('Page object was used in HTTP-only pre-navigation hook'); }, }, // This is safe because `executeHooks` just passes the context to the hooks which accept the partial context pageGotoOptions); const response = await crawlingContext.sendRequest({}); const loadedUrl = response.url; crawlingContext.request.loadedUrl = loadedUrl; if (!this.requestMatchesEnqueueStrategy(crawlingContext.request)) { const request = crawlingContext.request; this.log.debug( // eslint-disable-next-line dot-notation `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`); request.noRetry = true; request.state = core_1.RequestState.SKIPPED; await this.handleSkippedRequest({ url: request.url, reason: 'redirect' }); return; } const $ = (0, cheerio_1.load)(response.body); await this.adaptiveRequestHandler({ ...hookContext, request: crawlingContext.request, response, get page() { throw new Error('Page object was used in HTTP-only request handler'); }, async querySelector(selector, _timeoutMs) { return $(selector); }, async waitForSelector(selector, _timeoutMs) { if ($(selector).get().length === 0) { throw new Error(`Selector '${selector}' not found.`); } }, async parseWithCheerio(selector, _timeoutMs) { if (selector && $(selector).get().length === 0) { throw new Error(`Selector '${selector}' not found.`); } return $; }, enqueueLinks: async (options = {}) => { const urls = options.urls ?? (0, utils_1.extractUrlsFromCheerio)($, options.selector, options.baseUrl ?? loadedUrl); return this.enqueueLinks({ ...options, urls }, crawlingContext.request, result); }, addRequests: result.addRequests, pushData: result.pushData, useState: async (defaultValue) => { // return the old state before the browser handler was executed // when rerunning the handler via HTTP for detection if (oldStateCopy !== undefined) { return oldStateCopy ?? defaultValue; // fallback to the default for `null` } return this.allowStorageAccess(result.useState)(defaultValue); }, getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore), }); await this._executeHooks(this.postNavigationHooks, crawlingContext, pageGotoOptions); }, this.requestHandlerTimeoutInnerMillis, 'Request handler timed out')); return { result, logs, ok: true }; } catch (error) { return { error, logs, ok: false }; } } async enqueueLinks(options, request, result) { const baseUrl = (0, core_1.resolveBaseUrlForEnqueueLinksFiltering)({ enqueueStrategy: options?.strategy, finalRequestUrl: request.loadedUrl, originalRequestUrl: request.url, userProvidedBaseUrl: options?.baseUrl, }); const addRequestsBatched = async (requests) => { await result.addRequests(requests); return { addedRequests: requests.map(({ uniqueKey, id }) => ({ uniqueKey, requestId: id ?? '', wasAlreadyPresent: false, wasAlreadyHandled: false, })), waitForAllRequestsToBeAdded: Promise.resolve([]), }; }; // We need to use a mock request queue implementation, in order to add the requests into our result object const mockRequestQueue = { addRequestsBatched }; return await this.enqueueLinksWithCrawlDepth({ ...options, baseUrl }, request, mockRequestQueue); } createLogProxy(log, logs) { return new Proxy(log, { get(target, propertyName, receiver) { if (proxyLogMethods.includes(propertyName)) { return (...args) => { logs.push([target, propertyName, ...args]); }; } return Reflect.get(target, propertyName, receiver); }, }); } } exports.AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler; function createAdaptivePlaywrightRouter(routes) { return core_1.Router.create(routes); } //# sourceMappingURL=adaptive-playwright-crawler.js.map