@crawlee/playwright
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. It enables development of data extraction and web automation jobs with (but not limited to) headless Chrome and Puppeteer.
145 lines • 6.46 kB
JavaScript
;
// Mark the CommonJS exports object with the `__esModule` flag.
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the export slot up front; the class is assigned to it once it has been defined.
exports.RenderingTypePredictor = void 0;
const tslib_1 = require("tslib");
const core_1 = require("@crawlee/core");
const ml_logistic_regression_1 = tslib_1.__importDefault(require("ml-logistic-regression"));
const ml_matrix_1 = require("ml-matrix");
const string_comparison_1 = tslib_1.__importDefault(require("string-comparison"));
/**
 * Breaks a URL into comparable parts: the hostname followed by the pathname split on `/`.
 *
 * Because a pathname starts with `/`, the element right after the hostname is an empty string.
 *
 * @param {URL} url - Parsed URL to decompose.
 * @returns {string[]} `[hostname, ...pathSegments]`.
 */
const urlComponents = (url) => {
    const { hostname, pathname } = url;
    return [hostname].concat(pathname.split('/'));
};
/**
 * Scores how alike two URLs are, given their component arrays (hostname first, then path segments).
 *
 * URLs with different hostnames score 0. Otherwise every position after the hostname is compared with
 * the `jaroWinkler` similarity from `string-comparison` (a missing segment is treated as an empty string),
 * and a position counts as a match when that similarity is above 0.8. The number of matches is divided by
 * the length of the longer component array. The hostname slot is part of that divisor, so even identical
 * inputs score below 1.
 *
 * @param {string[]} a - Components of the first URL.
 * @param {string[]} b - Components of the second URL.
 * @returns {number} Share of matching positions, or 0 when the hostnames differ.
 */
const calculateUrlSimilarity = (a, b) => {
    const [hostA] = a;
    const [hostB] = b;
    if (hostA !== hostB) {
        return 0;
    }
    const longest = Math.max(a.length, b.length);
    // One entry per position after the hostname (index 0 was handled above).
    const comparedPositions = Math.max(longest - 1, 0);
    const values = Array.from({ length: comparedPositions }, (_, offset) => {
        const position = offset + 1;
        const similarity = string_comparison_1.default.jaroWinkler.similarity(a[position] ?? '', b[position] ?? '');
        return similarity > 0.8 ? 1 : 0;
    });
    return sum(values) / longest;
};
/**
 * Adds up all numbers in `values`.
 *
 * The reduction is seeded with 0 so that an empty array yields 0; `reduce` without an initial value
 * throws a `TypeError` on an empty array.
 *
 * @param {number[]} values - Numbers to add together; may be empty.
 * @returns {number} The total, or 0 for an empty array.
 */
const sum = (values) => values.reduce((acc, value) => acc + value, 0);
/**
 * Computes the arithmetic mean of `values`.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number | undefined} The mean, or `undefined` when there is nothing to average.
 */
const mean = (values) => {
    const hasValues = values.length > 0;
    if (!hasValues) {
        return undefined;
    }
    return sum(values) / values.length;
};
/**
 * Remembers which rendering type was detected for already crawled URLs (grouped by request label) and
 * uses a logistic regression model, trained on URL similarity to those stored URLs, to guess the
 * rendering type of URLs that have not been crawled yet. Every prediction also carries a recommendation
 * for how likely rendering type detection should be performed.
 *
 * @experimental
 */
class RenderingTypePredictor {
    /** Multiplier used by `predict` when computing the detection probability recommendation. */
    detectionRatio;
    /** `RecoverableState` instance whose current value holds `logreg` and `detectionResults`. */
    state;
    /**
     * @param {object} options
     * @param {number} options.detectionRatio - Base factor for the detection probability recommendation.
     * @param {object} [options.persistenceOptions] - Extra `RecoverableState` options; spread last, so
     *   they override the defaults set here (including the state key and the enabled flag).
     */
    constructor({ detectionRatio, persistenceOptions }) {
        // Stored shape: `{ logreg, detectionResults: [{ renderingType, urlPartsByLabel: [{ label, urlParts }] }] }`.
        const serialize = (state) => {
            const logreg = state.logreg.toJSON();
            const detectionResults = [...state.detectionResults.entries()].map(([renderingType, urlPartsByLabel]) => {
                const serializedLabels = [...urlPartsByLabel.entries()].map(([label, urlParts]) => ({ label, urlParts }));
                return { renderingType, urlPartsByLabel: serializedLabels };
            });
            return JSON.stringify({ logreg, detectionResults });
        };
        // Inverse of `serialize`; a missing `detectionResults` key restores to an empty map.
        const deserialize = (serializedState) => {
            const { logreg, detectionResults = [] } = JSON.parse(serializedState);
            const restoredModel = ml_logistic_regression_1.default.load(logreg);
            const restoreLabelMap = (items) => new Map(items.map((item) => [item.label, item.urlParts]));
            const restoredEntries = detectionResults.map((serializedItem) => [
                serializedItem.renderingType,
                restoreLabelMap(serializedItem.urlPartsByLabel),
            ]);
            return {
                logreg: restoredModel,
                detectionResults: new Map(restoredEntries),
            };
        };
        const defaultState = {
            logreg: new ml_logistic_regression_1.default({ numSteps: 1000, learningRate: 0.05 }),
            detectionResults: new Map(),
        };
        this.detectionRatio = detectionRatio;
        this.state = new core_1.RecoverableState({
            defaultState,
            serialize,
            deserialize,
            persistStateKey: 'rendering-type-predictor-state',
            persistenceEnabled: true,
            ...persistenceOptions,
        });
    }
    /**
     * Initialize the predictor by delegating to the underlying state's `initialize()`.
     */
    async initialize() {
        await this.state.initialize();
    }
    /**
     * Predict the rendering type for a given URL and request label.
     *
     * While the model has no classifiers, the answer is always `clientOnly` with a recommendation of 1.
     * Otherwise `loadedUrl` is preferred over `url`. The recommendation is 1 when the two classifier
     * scores are less than 0.1 apart; otherwise it is `detectionRatio * max(1, 5 - storedResultsForLabel)`.
     *
     * @returns {{ renderingType: string, detectionProbabilityRecommendation: number }}
     */
    predict({ url, loadedUrl, label }) {
        const model = this.state.currentValue.logreg;
        if (model.classifiers.length === 0) {
            return { renderingType: 'clientOnly', detectionProbabilityRecommendation: 1 };
        }
        const components = urlComponents(new URL(loadedUrl ?? url));
        const urlFeature = new ml_matrix_1.Matrix([this.calculateFeatureVector(components, label)]);
        const [prediction] = model.predict(urlFeature);
        // NOTE(review): the subtraction below relies on `testScores` results being coercible to numbers - confirm.
        const firstScore = model.classifiers[0].testScores(urlFeature);
        const secondScore = model.classifiers[1].testScores(urlFeature);
        const renderingType = prediction === 1 ? 'static' : 'clientOnly';
        let detectionProbabilityRecommendation = 1;
        if (!(Math.abs(firstScore - secondScore) < 0.1)) {
            detectionProbabilityRecommendation = this.detectionRatio * Math.max(1, 5 - this.resultCount(label));
        }
        return { renderingType, detectionProbabilityRecommendation };
    }
    /**
     * Store the rendering type for one request or an array of requests. The model is retrained once
     * after all of them have been recorded, which may be costly.
     */
    storeResult(requests, renderingType) {
        const { detectionResults } = this.state.currentValue;
        const requestList = Array.isArray(requests) ? requests : [requests];
        for (const request of requestList) {
            const { url, loadedUrl, label } = request;
            const resultUrl = new URL(loadedUrl ?? url);
            if (!detectionResults.has(renderingType)) {
                detectionResults.set(renderingType, new Map());
            }
            const urlsByLabel = detectionResults.get(renderingType);
            if (!urlsByLabel.has(label)) {
                urlsByLabel.set(label, []);
            }
            urlsByLabel.get(label).push(urlComponents(resultUrl));
        }
        this.retrain();
    }
    /**
     * Number of stored results for `label`, summed over all rendering types.
     */
    resultCount(label) {
        let total = 0;
        for (const urlsByLabel of this.state.currentValue.detectionResults.values()) {
            total += urlsByLabel.get(label)?.length ?? 0;
        }
        return total;
    }
    /**
     * Builds the two-element model input for a URL: its mean similarity to the stored `static` URLs and
     * to the stored `clientOnly` URLs of the same label (0 when none are stored).
     */
    calculateFeatureVector(url, label) {
        const { detectionResults } = this.state.currentValue;
        const meanSimilarityTo = (renderingType) => {
            const knownUrls = detectionResults.get(renderingType)?.get(label) ?? [];
            const similarities = knownUrls.map((otherUrl) => calculateUrlSimilarity(url, otherUrl) ?? 0);
            return mean(similarities) ?? 0;
        };
        return [meanSimilarityTo('static'), meanSimilarityTo('clientOnly')];
    }
    /**
     * Retrains the model from scratch on every stored URL (target 1 for `static`, 0 otherwise).
     */
    retrain() {
        const { detectionResults, logreg } = this.state.currentValue;
        // Two fixed seed samples, one per target value, precede the stored results.
        const features = [
            [0, 1],
            [1, 0],
        ];
        const targets = [0, 1];
        detectionResults.forEach((urlsByLabel, renderingType) => {
            const target = renderingType === 'static' ? 1 : 0;
            urlsByLabel.forEach((urls, label) => {
                for (const url of urls) {
                    features.push(this.calculateFeatureVector(url, label));
                    targets.push(target);
                }
            });
        });
        logreg.train(new ml_matrix_1.Matrix(features), ml_matrix_1.Matrix.columnVector(targets));
    }
}
// Publish the class as the module's `RenderingTypePredictor` export.
exports.RenderingTypePredictor = RenderingTypePredictor;
//# sourceMappingURL=rendering-type-prediction.js.map