@crawlee/playwright
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
472 lines • 23.7 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.AdaptivePlaywrightCrawler = void 0;
exports.createAdaptivePlaywrightRouter = createAdaptivePlaywrightRouter;
const tslib_1 = require("tslib");
const browser_1 = require("@crawlee/browser");
const core_1 = require("@crawlee/core");
const utils_1 = require("@crawlee/utils");
const cheerio_1 = require("cheerio");
const lodash_isequal_1 = tslib_1.__importDefault(require("lodash.isequal"));
const timeout_1 = require("@apify/timeout");
const playwright_crawler_1 = require("./playwright-crawler");
const rendering_type_prediction_1 = require("./utils/rendering-type-prediction");
/**
 * Crawler statistics extended with three counters specific to adaptive crawling:
 * `httpOnlyRequestHandlerRuns`, `browserRequestHandlerRuns` and `renderingTypeMispredictions`.
 * The counters live on the inherited `state` object.
 */
class AdaptivePlaywrightCrawlerStatistics extends core_1.Statistics {
    constructor(options = {}) {
        super(options);
        // The property has to be defined on the instance for the override to be valid. Defining it
        // sets it to `null`, so `reset()` is called right after to populate it again.
        Object.defineProperty(this, "state", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: null
        });
        this.reset();
    }
    /**
     * Resets the inherited statistics and sets all adaptive counters back to zero.
     */
    reset() {
        super.reset();
        Object.assign(this.state, {
            httpOnlyRequestHandlerRuns: 0,
            browserRequestHandlerRuns: 0,
            renderingTypeMispredictions: 0,
        });
    }
    /**
     * Runs the inherited loading logic and then copies the adaptive counters from the value stored
     * under `persistStateKey` in the key-value store. Does nothing more when no value is stored.
     */
    async _maybeLoadStatistics() {
        await super._maybeLoadStatistics();
        const persisted = await this.keyValueStore?.getValue(this.persistStateKey);
        if (persisted) {
            const { httpOnlyRequestHandlerRuns, browserRequestHandlerRuns, renderingTypeMispredictions } = persisted;
            Object.assign(this.state, {
                httpOnlyRequestHandlerRuns,
                browserRequestHandlerRuns,
                renderingTypeMispredictions,
            });
        }
    }
    /** Counts one run of the HTTP-only request handler (a missing counter starts from zero). */
    trackHttpOnlyRequestHandlerRun() {
        this.state.httpOnlyRequestHandlerRuns = (this.state.httpOnlyRequestHandlerRuns ?? 0) + 1;
    }
    /** Counts one run of the browser request handler (a missing counter starts from zero). */
    trackBrowserRequestHandlerRun() {
        this.state.browserRequestHandlerRuns = (this.state.browserRequestHandlerRuns ?? 0) + 1;
    }
    /** Counts one rendering type misprediction (a missing counter starts from zero). */
    trackRenderingTypeMisprediction() {
        this.state.renderingTypeMispredictions = (this.state.renderingTypeMispredictions ?? 0) + 1;
    }
}
/**
 * Names of the logger methods that `createLogProxy` records instead of executing immediately.
 * Calls to these methods made during an HTTP-only handler run are buffered and only replayed
 * when that run's result is actually used.
 *
 * `warning` has to be part of the list: otherwise warnings logged by the handler would be
 * emitted right away, even for HTTP-only runs whose output is thrown away, unlike every other
 * log level listed here.
 */
const proxyLogMethods = [
    'error',
    'exception',
    'softFail',
    'warning',
    'info',
    'debug',
    'perf',
    'warningOnce',
    'deprecated',
];
/**
* An extension of {@link PlaywrightCrawler} that uses a more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible.
*
* **Example usage:**
*
* ```javascript
* const crawler = new AdaptivePlaywrightCrawler({
* renderingTypeDetectionRatio: 0.1,
* async requestHandler({ querySelector, pushData, enqueueLinks, request, log }) {
* // This function is called to extract data from a single web page
* const $prices = await querySelector('span.price')
*
* await pushData({
* url: request.url,
* price: $prices.filter(':contains("$")').first().text(),
* })
*
* await enqueueLinks({ selector: '.pagination a' })
* },
* });
*
* await crawler.run([
* 'http://www.example.com/page-1',
* 'http://www.example.com/page-2',
* ]);
* ```
*
* @experimental
*/
class AdaptivePlaywrightCrawler extends playwright_crawler_1.PlaywrightCrawler {
constructor(options = {}, config = core_1.Configuration.getGlobalConfig()) {
const { requestHandler, renderingTypeDetectionRatio = 0.1, renderingTypePredictor, resultChecker, resultComparator, statisticsOptions, preventDirectStorageAccess = true, ...rest } = options;
super(rest, config);
Object.defineProperty(this, "config", {
enumerable: true,
configurable: true,
writable: true,
value: config
});
Object.defineProperty(this, "adaptiveRequestHandler", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "renderingTypePredictor", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "resultChecker", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "resultComparator", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "preventDirectStorageAccess", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "inFlightRenderingTypeDetections", {
enumerable: true,
configurable: true,
writable: true,
value: 0
});
/**
* Default {@link Router} instance that will be used if we don't specify any {@link AdaptivePlaywrightCrawlerOptions.requestHandler|`requestHandler`}.
* See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
*/
// @ts-ignore
Object.defineProperty(this, "router", {
enumerable: true,
configurable: true,
writable: true,
value: core_1.Router.create()
});
this.adaptiveRequestHandler = requestHandler ?? this.router;
this.renderingTypePredictor =
renderingTypePredictor ?? new rendering_type_prediction_1.RenderingTypePredictor({ detectionRatio: renderingTypeDetectionRatio });
this.resultChecker = resultChecker ?? (() => true);
if (resultComparator !== undefined) {
this.resultComparator = resultComparator;
}
else if (resultChecker !== undefined) {
this.resultComparator = (resultA, resultB) => this.resultChecker(resultA) && this.resultChecker(resultB);
}
else {
this.resultComparator = (resultA, resultB) => {
return (resultA.datasetItems.length === resultB.datasetItems.length &&
resultA.datasetItems.every((itemA, i) => {
const itemB = resultB.datasetItems[i];
return (0, lodash_isequal_1.default)(itemA, itemB);
}));
};
}
this.stats = new AdaptivePlaywrightCrawlerStatistics({
logMessage: `${this.log.getOptions().prefix} request statistics:`,
config,
...statisticsOptions,
});
this.preventDirectStorageAccess = preventDirectStorageAccess;
}
/**
* Returns the number of rendering type detections currently in progress.
*/
get inFlightRenderingTypeDetectionCount() {
return this.inFlightRenderingTypeDetections;
}
async _init() {
await this.renderingTypePredictor.initialize();
return await super._init();
}
async _runRequestHandler(crawlingContext) {
const renderingTypePrediction = this.renderingTypePredictor.predict(crawlingContext.request);
const shouldDetectRenderingType = Math.random() < renderingTypePrediction.detectionProbabilityRecommendation;
if (shouldDetectRenderingType) {
this.inFlightRenderingTypeDetections++;
}
try {
if (!shouldDetectRenderingType) {
crawlingContext.log.debug(`Predicted rendering type ${renderingTypePrediction.renderingType} for ${crawlingContext.request.url}`);
}
if (renderingTypePrediction.renderingType === 'static' && !shouldDetectRenderingType) {
crawlingContext.log.debug(`Running HTTP-only request handler for ${crawlingContext.request.url}`);
this.stats.trackHttpOnlyRequestHandlerRun();
const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext);
if (plainHTTPRun.ok && this.resultChecker(plainHTTPRun.result)) {
crawlingContext.log.debug(`HTTP-only request handler succeeded for ${crawlingContext.request.url}`);
plainHTTPRun.logs?.forEach(([log, method, ...args]) => log[method](...args));
await this.commitResult(crawlingContext, plainHTTPRun.result);
return;
}
if (!plainHTTPRun.ok) {
crawlingContext.log.exception(plainHTTPRun.error, `HTTP-only request handler failed for ${crawlingContext.request.url}`);
}
else {
crawlingContext.log.warning(`HTTP-only request handler returned a suspicious result for ${crawlingContext.request.url}`);
this.stats.trackRenderingTypeMisprediction();
}
}
crawlingContext.log.debug(`Running browser request handler for ${crawlingContext.request.url}`);
this.stats.trackBrowserRequestHandlerRun();
// Run the request handler in a browser. The copy of the crawler state is kept so that we can perform
// a rendering type detection if necessary. Without this measure, the HTTP request handler would run
// under different conditions, which could change its behavior. Changes done to the crawler state by
// the HTTP request handler will not be committed to the actual storage.
const { result: browserRun, initialStateCopy } = await this.runRequestHandlerInBrowser(crawlingContext);
if (!browserRun.ok) {
throw browserRun.error;
}
await this.commitResult(crawlingContext, browserRun.result);
if (shouldDetectRenderingType) {
crawlingContext.log.debug(`Detecting rendering type for ${crawlingContext.request.url}`);
const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext, initialStateCopy);
const detectionResult = (() => {
if (!plainHTTPRun.ok) {
return 'clientOnly';
}
const comparisonResult = this.resultComparator(plainHTTPRun.result, browserRun.result);
if (comparisonResult === true || comparisonResult === 'equal') {
return 'static';
}
if (comparisonResult === false || comparisonResult === 'different') {
return 'clientOnly';
}
return undefined;
})();
crawlingContext.log.debug(`Detected rendering type ${detectionResult} for ${crawlingContext.request.url}`);
if (detectionResult !== undefined) {
this.renderingTypePredictor.storeResult(crawlingContext.request, detectionResult);
}
}
}
finally {
if (shouldDetectRenderingType) {
this.inFlightRenderingTypeDetections--;
}
}
}
async commitResult(crawlingContext, { calls, keyValueStoreChanges }) {
await Promise.all([
...calls.pushData.map(async (params) => crawlingContext.pushData(...params)),
...calls.addRequests.map(async (params) => crawlingContext.addRequests(...params)),
...Object.entries(keyValueStoreChanges).map(async ([storeIdOrName, changes]) => {
const store = await crawlingContext.getKeyValueStore(storeIdOrName);
await Promise.all(Object.entries(changes).map(async ([key, { changedValue, options }]) => store.setValue(key, changedValue, options)));
}),
]);
}
allowStorageAccess(func) {
return async (...args) => (0, core_1.withCheckedStorageAccess)(() => { }, async () => func(...args));
}
/**
 * Runs the adaptive request handler through the parent crawler's browser-based `_runRequestHandler`.
 *
 * The parent implementation is invoked on a Proxy of this crawler that replaces only
 * `userProvidedRequestHandler`. The replacement calls the adaptive handler with a restricted
 * context whose `pushData`, `addRequests`, `useState` and `getKeyValueStore` come from a fresh
 * `RequestHandlerResult`, so the handler's output is collected in that object.
 *
 * @param crawlingContext Crawling context of the processed request.
 * @returns `{ result, initialStateCopy }`. `result` is `{ result, ok: true }` on success and
 *   `{ error, ok: false }` when anything threw. `initialStateCopy` is a JSON clone of the value
 *   returned by the handler's first `useState` call, or `undefined` if `useState` was never called.
 */
async runRequestHandlerInBrowser(crawlingContext) {
const result = new core_1.RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
let initialStateCopy;
try {
await super._runRequestHandler.call(new Proxy(this, {
get: (target, propertyName, receiver) => {
// Only `userProvidedRequestHandler` is intercepted; any other property is read from the real crawler.
if (propertyName === 'userProvidedRequestHandler') {
// The first callback is the storage access checker; it throws only when `preventDirectStorageAccess` is set.
// NOTE(review): presumably `withCheckedStorageAccess` invokes it whenever storage is accessed inside the second callback — confirm.
return async (playwrightContext) => (0, core_1.withCheckedStorageAccess)(() => {
if (this.preventDirectStorageAccess) {
throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler');
}
}, () => this.adaptiveRequestHandler({
id: crawlingContext.id,
session: crawlingContext.session,
proxyInfo: crawlingContext.proxyInfo,
request: crawlingContext.request,
// Plain-object view of the browser response; `trailers`, `complete` and `redirectUrls` are fixed placeholder values.
response: {
url: crawlingContext.response.url(),
statusCode: crawlingContext.response.status(),
headers: crawlingContext.response.headers(),
trailers: {},
complete: true,
redirectUrls: [],
},
log: crawlingContext.log,
page: crawlingContext.page,
// Waits (5000 ms by default) until the first match is attached, then returns the Cheerio selection.
querySelector: async (selector, timeoutMs = 5000) => {
const locator = playwrightContext.page.locator(selector).first();
await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
const $ = await playwrightContext.parseWithCheerio();
return $(selector);
},
// Waits (5000 ms by default) until the first match is attached.
async waitForSelector(selector, timeoutMs = 5000) {
const locator = playwrightContext.page.locator(selector).first();
await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
},
// Optionally waits for `selector` first, then parses the page with Cheerio.
async parseWithCheerio(selector, timeoutMs = 5000) {
if (selector) {
const locator = playwrightContext.page.locator(selector).first();
await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
}
return playwrightContext.parseWithCheerio();
},
// When no `urls` are given, waits for the selector (default 'a') and extracts the URLs from the page.
// The links are then enqueued into `result` through this crawler's `enqueueLinks`.
enqueueLinks: async (options = {}, timeoutMs = 5000) => {
let urls;
if (options.urls === undefined) {
const selector = options.selector ?? 'a';
const locator = playwrightContext.page.locator(selector).first();
await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
urls = await (0, browser_1.extractUrlsFromPage)(playwrightContext.page, selector, options.baseUrl ??
playwrightContext.request.loadedUrl ??
playwrightContext.request.url);
}
else {
urls = options.urls;
}
return await this.enqueueLinks({ ...options, urls }, crawlingContext.request, result);
},
addRequests: result.addRequests,
pushData: result.pushData,
// The first `useState` call also snapshots the state (JSON round-trip) into `initialStateCopy`.
useState: this.allowStorageAccess(async (defaultValue) => {
const state = await result.useState(defaultValue);
if (initialStateCopy === undefined) {
initialStateCopy = JSON.parse(JSON.stringify(state));
}
return state;
}),
getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
}));
}
return Reflect.get(target, propertyName, receiver);
},
}), crawlingContext);
return { result: { result, ok: true }, initialStateCopy };
}
catch (error) {
// Errors are reported through the return value instead of being thrown.
return { result: { error, ok: false }, initialStateCopy };
}
}
/**
 * Runs the adaptive request handler without a browser: the page is fetched with
 * `crawlingContext.sendRequest`, parsed with Cheerio, and the handler receives a context whose
 * selector helpers work on the parsed HTML. Accessing `page` in hooks or in the handler throws.
 *
 * The whole run (pre-navigation hooks, request, handler, post-navigation hooks) is wrapped in
 * `addTimeoutToPromise` using `requestHandlerTimeoutInnerMillis`, and in `withCheckedStorageAccess`
 * with a checker that throws when `preventDirectStorageAccess` is set.
 *
 * @param crawlingContext Crawling context of the processed request.
 * @param oldStateCopy Optional state snapshot; when it is not `undefined`, the handler's `useState`
 *   returns it (or the default value if the snapshot is `null`) instead of reading the state.
 * @returns `{ result, logs, ok: true }` on success or `{ error, logs, ok: false }` when anything
 *   threw. `logs` holds the log calls recorded by the log proxy.
 */
async runRequestHandlerWithPlainHTTP(crawlingContext, oldStateCopy) {
const result = new core_1.RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
// Log calls made through the proxied logger are collected here instead of being executed.
const logs = [];
const pageGotoOptions = { timeout: this.navigationTimeoutMillis }; // Irrelevant, but required by BrowserCrawler
try {
await (0, core_1.withCheckedStorageAccess)(() => {
if (this.preventDirectStorageAccess) {
throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler');
}
}, async () => (0, timeout_1.addTimeoutToPromise)(async () => {
// Context shared by the pre-navigation hooks and the request handler.
const hookContext = {
id: crawlingContext.id,
session: crawlingContext.session,
proxyInfo: crawlingContext.proxyInfo,
request: crawlingContext.request,
log: this.createLogProxy(crawlingContext.log, logs),
};
await this._executeHooks(this.preNavigationHooks, {
...hookContext,
get page() {
throw new Error('Page object was used in HTTP-only pre-navigation hook');
},
}, // This is safe because `executeHooks` just passes the context to the hooks which accept the partial context
pageGotoOptions);
const response = await crawlingContext.sendRequest({});
const loadedUrl = response.url;
// The request object itself is updated with the URL the response was loaded from.
crawlingContext.request.loadedUrl = loadedUrl;
if (!this.requestMatchesEnqueueStrategy(crawlingContext.request)) {
const request = crawlingContext.request;
this.log.debug(
// eslint-disable-next-line dot-notation
`Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`);
request.noRetry = true;
request.state = core_1.RequestState.SKIPPED;
await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
// NOTE(review): this only leaves the inner callback, so the method still returns `ok: true` with an empty `result` — confirm callers expect that.
return;
}
const $ = (0, cheerio_1.load)(response.body);
await this.adaptiveRequestHandler({
...hookContext,
request: crawlingContext.request,
response,
get page() {
throw new Error('Page object was used in HTTP-only request handler');
},
// The timeout arguments are ignored by the helpers below: the HTML is static, so there is nothing to wait for.
async querySelector(selector, _timeoutMs) {
return $(selector);
},
async waitForSelector(selector, _timeoutMs) {
if ($(selector).get().length === 0) {
throw new Error(`Selector '${selector}' not found.`);
}
},
async parseWithCheerio(selector, _timeoutMs) {
if (selector && $(selector).get().length === 0) {
throw new Error(`Selector '${selector}' not found.`);
}
return $;
},
// Uses the given `urls`, or extracts them from the parsed HTML, and enqueues them into `result`.
enqueueLinks: async (options = {}) => {
const urls = options.urls ??
(0, utils_1.extractUrlsFromCheerio)($, options.selector, options.baseUrl ?? loadedUrl);
return this.enqueueLinks({ ...options, urls }, crawlingContext.request, result);
},
addRequests: result.addRequests,
pushData: result.pushData,
useState: async (defaultValue) => {
// return the old state before the browser handler was executed
// when rerunning the handler via HTTP for detection
if (oldStateCopy !== undefined) {
return oldStateCopy ?? defaultValue; // fallback to the default for `null`
}
return this.allowStorageAccess(result.useState)(defaultValue);
},
getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
});
// NOTE(review): the post-navigation hooks run after the request handler and receive the full `crawlingContext`, not the restricted hook context — confirm this is intended.
await this._executeHooks(this.postNavigationHooks, crawlingContext, pageGotoOptions);
}, this.requestHandlerTimeoutInnerMillis, 'Request handler timed out'));
return { result, logs, ok: true };
}
catch (error) {
// Errors (including the timeout) are reported through the return value instead of being thrown.
return { error, logs, ok: false };
}
}
async enqueueLinks(options, request, result) {
const baseUrl = (0, core_1.resolveBaseUrlForEnqueueLinksFiltering)({
enqueueStrategy: options?.strategy,
finalRequestUrl: request.loadedUrl,
originalRequestUrl: request.url,
userProvidedBaseUrl: options?.baseUrl,
});
const addRequestsBatched = async (requests) => {
await result.addRequests(requests);
return {
addedRequests: requests.map(({ uniqueKey, id }) => ({
uniqueKey,
requestId: id ?? '',
wasAlreadyPresent: false,
wasAlreadyHandled: false,
})),
waitForAllRequestsToBeAdded: Promise.resolve([]),
};
};
// We need to use a mock request queue implementation, in order to add the requests into our result object
const mockRequestQueue = { addRequestsBatched };
return await this.enqueueLinksWithCrawlDepth({ ...options, baseUrl }, request, mockRequestQueue);
}
createLogProxy(log, logs) {
return new Proxy(log, {
get(target, propertyName, receiver) {
if (proxyLogMethods.includes(propertyName)) {
return (...args) => {
logs.push([target, propertyName, ...args]);
};
}
return Reflect.get(target, propertyName, receiver);
},
});
}
}
exports.AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler;
/**
 * Creates a router for use with the adaptive crawler by delegating to `Router.create`.
 *
 * @param routes Optional initial routes, passed to `Router.create` as is.
 * @returns The router returned by `Router.create`.
 */
function createAdaptivePlaywrightRouter(routes) {
    const router = core_1.Router.create(routes);
    return router;
}
//# sourceMappingURL=adaptive-playwright-crawler.js.map