UNPKG

@crawlee/playwright

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

145 lines • 6.46 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.RenderingTypePredictor = void 0;
const tslib_1 = require("tslib");
const core_1 = require("@crawlee/core");
const ml_logistic_regression_1 = tslib_1.__importDefault(require("ml-logistic-regression"));
const ml_matrix_1 = require("ml-matrix");
const string_comparison_1 = tslib_1.__importDefault(require("string-comparison"));
/**
 * Splits a URL into the component list used for similarity comparison:
 * the hostname first, followed by every `/`-separated piece of the pathname.
 *
 * @param {URL} url
 * @returns {string[]} `[hostname, ...pathSegments]`
 */
const urlComponents = (url) => {
    return [url.hostname, ...url.pathname.split('/')];
};
/**
 * Adds up a list of numbers.
 *
 * The explicit initial value makes an empty list sum to 0 instead of throwing
 * a `TypeError` from `reduce`.
 *
 * @param {number[]} values
 * @returns {number}
 */
const sum = (values) => values.reduce((acc, value) => acc + value, 0);
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values
 * @returns {number | undefined} `undefined` when the list is empty.
 */
const mean = (values) => (values.length > 0 ? sum(values) / values.length : undefined);
/**
 * Scores how similar two component lists (as produced by `urlComponents`) are.
 *
 * Different hostnames (index 0) always score 0. Otherwise every remaining position
 * scores 1 when the Jaro-Winkler similarity of the two segments exceeds 0.8
 * (a missing segment is compared as an empty string) and 0 when it does not.
 * The number of matching positions is divided by the length of the longer list;
 * that length includes the hostname slot, so the result stays below 1.
 *
 * @param {string[]} a
 * @param {string[]} b
 * @returns {number}
 */
const calculateUrlSimilarity = (a, b) => {
    if (a[0] !== b[0]) {
        return 0;
    }
    const longest = Math.max(a.length, b.length);
    if (longest === 0) {
        // Both lists are empty: nothing to compare, and dividing by zero would yield NaN.
        return 0;
    }
    const values = [];
    for (let i = 1; i < longest; i++) {
        values.push(string_comparison_1.default.jaroWinkler.similarity(a[i] ?? '', b[i] ?? '') > 0.8 ? 1 : 0);
    }
    return sum(values) / longest;
};
/**
 * Stores rendering type information for previously crawled URLs and predicts the rendering type for URLs that have yet to be crawled and recommends when rendering type detection should be performed.
 *
 * @experimental
 */
class RenderingTypePredictor {
    /**
     * @param options
     * @param options.detectionRatio Base multiplier for the `detectionProbabilityRecommendation` returned by `predict`.
     * @param options.persistenceOptions Extra options spread into the `RecoverableState` configuration;
     *   they take precedence over the default `persistStateKey` and `persistenceEnabled` values.
     */
    constructor({ detectionRatio, persistenceOptions }) {
        Object.defineProperty(this, "detectionRatio", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "state", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        this.detectionRatio = detectionRatio;
        this.state = new core_1.RecoverableState({
            defaultState: {
                logreg: new ml_logistic_regression_1.default({ numSteps: 1000, learningRate: 0.05 }),
                // Map<renderingType, Map<label, urlComponents[]>>
                detectionResults: new Map(),
            },
            // Maps are not JSON-serializable, so the nested maps are flattened into arrays of records.
            serialize: (state) => JSON.stringify({
                logreg: state.logreg.toJSON(),
                detectionResults: Array.from(state.detectionResults.entries()).map(([renderingType, urlPartsByLabel]) => ({
                    renderingType,
                    urlPartsByLabel: Array.from(urlPartsByLabel.entries()).map(([label, urlParts]) => ({
                        label,
                        urlParts,
                    })),
                })),
            }),
            // Inverse of `serialize`: rebuilds the model and the nested maps from the JSON string.
            deserialize: (serializedState) => {
                const { logreg, detectionResults = [] } = JSON.parse(serializedState);
                return {
                    logreg: ml_logistic_regression_1.default.load(logreg),
                    detectionResults: new Map(detectionResults.map((serializedItem) => [
                        serializedItem.renderingType,
                        new Map(serializedItem.urlPartsByLabel.map((item) => [item.label, item.urlParts])),
                    ])),
                };
            },
            persistStateKey: 'rendering-type-predictor-state',
            persistenceEnabled: true,
            ...persistenceOptions,
        });
    }
    /**
     * Initialize the predictor by restoring persisted state.
     */
    async initialize() {
        await this.state.initialize();
    }
    /**
     * Predict the rendering type for a given URL and request label.
     *
     * @param request
     * @param request.url Request URL; used when `loadedUrl` is not set.
     * @param request.loadedUrl URL preferred over `url` when present.
     * @param request.label Request label used to select the stored results to compare against.
     * @returns `{ renderingType, detectionProbabilityRecommendation }`, where `renderingType`
     *   is `'static'` or `'clientOnly'`.
     */
    predict({ url, loadedUrl, label }) {
        const { logreg } = this.state.currentValue;
        if (logreg.classifiers.length === 0) {
            // The model has no classifiers to consult (presumably it has not been trained yet),
            // so fall back to 'clientOnly' and recommend detection with probability 1.
            return { renderingType: 'clientOnly', detectionProbabilityRecommendation: 1 };
        }
        const predictionUrl = new URL(loadedUrl ?? url);
        const urlFeature = new ml_matrix_1.Matrix([this.calculateFeatureVector(urlComponents(predictionUrl), label)]);
        const [prediction] = logreg.predict(urlFeature);
        const scores = [logreg.classifiers[0].testScores(urlFeature), logreg.classifiers[1].testScores(urlFeature)];
        return {
            renderingType: prediction === 1 ? 'static' : 'clientOnly',
            // When the two classifier scores are within 0.1 of each other, recommend detection
            // with probability 1. Otherwise scale `detectionRatio` by a factor that shrinks
            // from 5 towards 1 as results accumulate for this label.
            detectionProbabilityRecommendation: Math.abs(scores[0] - scores[1]) < 0.1
                ? 1
                : this.detectionRatio * Math.max(1, 5 - this.resultCount(label)),
        };
    }
    /**
     * Store the rendering type for a given URL and request label. This updates the underlying prediction model, which may be costly.
     *
     * @param requests A single `{ url, loadedUrl, label }` object or an array of them.
     * @param renderingType Rendering type detected for all of the given requests.
     */
    storeResult(requests, renderingType) {
        const state = this.state.currentValue;
        for (const { url, loadedUrl, label } of Array.isArray(requests) ? requests : [requests]) {
            const resultUrl = new URL(loadedUrl ?? url);
            let urlsByLabel = state.detectionResults.get(renderingType);
            if (urlsByLabel === undefined) {
                urlsByLabel = new Map();
                state.detectionResults.set(renderingType, urlsByLabel);
            }
            let urls = urlsByLabel.get(label);
            if (urls === undefined) {
                urls = [];
                urlsByLabel.set(label, urls);
            }
            urls.push(urlComponents(resultUrl));
        }
        // Retrain once per call, after all requests have been recorded.
        this.retrain();
    }
    /**
     * Counts the stored results for a label across all rendering types.
     *
     * @param label Request label.
     * @returns {number}
     */
    resultCount(label) {
        return Array.from(this.state.currentValue.detectionResults.values())
            .map((results) => results.get(label)?.length ?? 0)
            .reduce((acc, value) => acc + value, 0);
    }
    /**
     * Builds the two-element feature vector for a URL: its mean similarity to the stored
     * `'static'` URLs and to the stored `'clientOnly'` URLs that share the label.
     * A side with no stored URLs contributes 0.
     *
     * @param {string[]} url Component list as produced by `urlComponents`.
     * @param label Request label.
     * @returns {[number, number]} `[staticSimilarity, clientOnlySimilarity]`
     */
    calculateFeatureVector(url, label) {
        const { detectionResults } = this.state.currentValue;
        const meanSimilarityTo = (renderingType) => mean((detectionResults.get(renderingType)?.get(label) ?? []).map((otherUrl) => calculateUrlSimilarity(url, otherUrl) ?? 0)) ?? 0;
        return [meanSimilarityTo('static'), meanSimilarityTo('clientOnly')];
    }
    /**
     * Retrains the logistic regression model on every stored result.
     * The target is 1 for `'static'` and 0 for any other rendering type.
     */
    retrain() {
        // Two fixed seed samples, one per class, are always part of the training set:
        // [0, 1] is labelled 0 and [1, 0] is labelled 1.
        const X = [
            [0, 1],
            [1, 0],
        ];
        const Y = [0, 1];
        for (const [renderingType, urlsByLabel] of this.state.currentValue.detectionResults.entries()) {
            for (const [label, urls] of urlsByLabel) {
                for (const url of urls) {
                    X.push(this.calculateFeatureVector(url, label));
                    Y.push(renderingType === 'static' ? 1 : 0);
                }
            }
        }
        this.state.currentValue.logreg.train(new ml_matrix_1.Matrix(X), ml_matrix_1.Matrix.columnVector(Y));
    }
}
exports.RenderingTypePredictor = RenderingTypePredictor;
//# sourceMappingURL=rendering-type-prediction.js.map