/*
 * @crawlee/core
 * Version:
 * The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
 * 335 lines • 12.5 kB
 * JavaScript
 */
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Session = void 0;
const tslib_1 = require("tslib");
const node_events_1 = require("node:events");
const ow_1 = tslib_1.__importDefault(require("ow"));
const tough_cookie_1 = require("tough-cookie");
const utilities_1 = require("@apify/utilities");
const cookie_utils_1 = require("../cookie_utils");
const log_1 = require("../log");
const events_1 = require("./events");
/**
* Sessions are used to store information such as cookies and can be used for generating fingerprints and proxy sessions.
* You can imagine each session as a specific user, with its own cookies, IP (via proxy) and potentially a unique browser fingerprint.
* Session internal state can be enriched with custom user data for example some authorization tokens and specific headers in general.
* @category Scaling
*/
class Session {
    /** Current error score; the session counts as blocked once it reaches `maxErrorScore`. */
    get errorScore() {
        return this._errorScore;
    }
    /** Number of times the session has been used so far. */
    get usageCount() {
        return this._usageCount;
    }
    /** Error score at which the session is considered blocked. */
    get maxErrorScore() {
        return this._maxErrorScore;
    }
    /** Amount subtracted from the error score by each `markGood()` call. */
    get errorScoreDecrement() {
        return this._errorScoreDecrement;
    }
    /** `Date` at (or after) which the session is considered expired. */
    get expiresAt() {
        return this._expiresAt;
    }
    /** `Date` the session was created. */
    get createdAt() {
        return this._createdAt;
    }
    /** Usage count at which the session stops being usable. */
    get maxUsageCount() {
        return this._maxUsageCount;
    }
    /** Cookie jar holding the cookies stored for this session. */
    get cookieJar() {
        return this._cookieJar;
    }
    /**
     * Session configuration.
     *
     * @param options Validated with `ow` as an exact shape, so unknown keys are rejected.
     * @param options.sessionPool Required; must be an `EventEmitter` instance. Receives the retire event.
     * @param [options.id] Session id. Defaults to `session_` followed by a random 10-character id.
     * @param [options.cookieJar] Cookie jar instance, or serialized jar data. Defaults to a new empty jar.
     * @param [options.maxAgeSecs=3000] Used to compute the default `expiresAt` and when normalizing cookies.
     * @param [options.userData={}] Custom user data attached to the session.
     * @param [options.maxErrorScore=3] Error score at which the session is blocked.
     * @param [options.errorScoreDecrement=0.5] Amount the error score drops on `markGood()`.
     * @param [options.createdAt] Creation date. Defaults to now.
     * @param [options.expiresAt] Expiration date. Defaults to `getDefaultCookieExpirationDate(maxAgeSecs)`.
     * @param [options.usageCount=0] Initial usage count.
     * @param [options.errorScore=0] Initial error score.
     * @param [options.maxUsageCount=50] Usage count at which the session stops being usable.
     * @param [options.log] Logger whose `child()` is used to create the session logger. Defaults to the shared log.
     */
    constructor(options) {
        // Define every instance field up front, in a fixed order, as an own enumerable,
        // configurable and writable property holding `undefined`.
        const fieldNames = [
            'id',
            'maxAgeSecs',
            'userData',
            '_maxErrorScore',
            '_errorScoreDecrement',
            '_createdAt',
            '_expiresAt',
            '_usageCount',
            '_maxUsageCount',
            'sessionPool',
            '_errorScore',
            '_cookieJar',
            'log',
        ];
        for (const fieldName of fieldNames) {
            Object.defineProperty(this, fieldName, {
                enumerable: true,
                configurable: true,
                writable: true,
                value: void 0,
            });
        }
        (0, ow_1.default)(options, ow_1.default.object.exactShape({
            sessionPool: ow_1.default.object.instanceOf(node_events_1.EventEmitter),
            id: ow_1.default.optional.string,
            cookieJar: ow_1.default.optional.object,
            maxAgeSecs: ow_1.default.optional.number,
            userData: ow_1.default.optional.object,
            maxErrorScore: ow_1.default.optional.number,
            errorScoreDecrement: ow_1.default.optional.number,
            createdAt: ow_1.default.optional.date,
            expiresAt: ow_1.default.optional.date,
            usageCount: ow_1.default.optional.number,
            errorScore: ow_1.default.optional.number,
            maxUsageCount: ow_1.default.optional.number,
            log: ow_1.default.optional.object,
        }));
        const { sessionPool, id = `session_${(0, utilities_1.cryptoRandomObjectId)(10)}`, cookieJar = new tough_cookie_1.CookieJar(), maxAgeSecs = 3000, userData = {}, maxErrorScore = 3, errorScoreDecrement = 0.5, createdAt = new Date(), usageCount = 0, errorScore = 0, maxUsageCount = 50, log = log_1.log, } = options;
        // Resolved separately because the default depends on the already-defaulted `maxAgeSecs`.
        const { expiresAt = (0, cookie_utils_1.getDefaultCookieExpirationDate)(maxAgeSecs) } = options;
        this.log = log.child({ prefix: 'Session' });
        // A value without a `setCookie` method is treated as serialized jar data and revived from JSON.
        this._cookieJar = cookieJar.setCookie ? cookieJar : tough_cookie_1.CookieJar.fromJSON(JSON.stringify(cookieJar));
        this.id = id;
        this.maxAgeSecs = maxAgeSecs;
        this.userData = userData;
        this._maxErrorScore = maxErrorScore;
        this._errorScoreDecrement = errorScoreDecrement;
        // Internal
        this._expiresAt = expiresAt;
        this._createdAt = createdAt;
        this._usageCount = usageCount; // indicates how many times the session has been used
        this._errorScore = errorScore; // accumulated score of requests marked bad with this session
        this._maxUsageCount = maxUsageCount;
        this.sessionPool = sessionPool;
    }
    /**
     * Indicates whether the session is blocked.
     * Session is blocked once it reaches the `maxErrorScore`.
     */
    isBlocked() {
        return this.errorScore >= this.maxErrorScore;
    }
    /**
     * Indicates whether the session is expired.
     * The session is expired once `expiresAt` is not later than the current time.
     */
    isExpired() {
        return this.expiresAt <= new Date();
    }
    /**
     * Indicates whether the session is used maximum number of times.
     * Session maximum usage count can be changed by `maxUsageCount` parameter.
     */
    isMaxUsageCountReached() {
        return this.usageCount >= this.maxUsageCount;
    }
    /**
     * Indicates whether the session can be used for next requests.
     * Session is usable when it is not expired, not blocked and the maximum usage count has not been reached.
     */
    isUsable() {
        return !this.isBlocked() && !this.isExpired() && !this.isMaxUsageCountReached();
    }
    /**
     * This method should be called after a successful session usage.
     * It increases `usageCount` and potentially lowers the `errorScore` by the `errorScoreDecrement`.
     * The error score never drops below zero.
     */
    markGood() {
        this._usageCount += 1;
        if (this._errorScore > 0) {
            // Clamp at zero: a decrement larger than the remaining score must not
            // produce a negative score that would absorb later failures.
            this._errorScore = Math.max(0, this._errorScore - this._errorScoreDecrement);
        }
        this._maybeSelfRetire();
    }
    /**
     * Gets session state for persistence in KeyValueStore.
     * Dates are serialized as ISO strings and the cookie jar via its `toJSON()`.
     * @returns Represents session internal state.
     */
    getState() {
        return {
            id: this.id,
            cookieJar: this.cookieJar.toJSON(),
            userData: this.userData,
            maxErrorScore: this.maxErrorScore,
            errorScoreDecrement: this.errorScoreDecrement,
            expiresAt: this.expiresAt.toISOString(),
            createdAt: this.createdAt.toISOString(),
            usageCount: this.usageCount,
            maxUsageCount: this.maxUsageCount,
            errorScore: this.errorScore,
        };
    }
    /**
     * Marks session as blocked and emits event on the `SessionPool`
     * This method should be used if the session usage was unsuccessful
     * and you are sure that it is because of the session configuration and not any external matters.
     * For example when server returns 403 status code.
     * If the session does not work due to some external factors as server error such as 5XX you probably want to use `markBad` method.
     */
    retire() {
        // mark it as an invalid by increasing the error score count.
        this._errorScore += this._maxErrorScore;
        this._usageCount += 1;
        // emit event so we can retire browser in puppeteer pool
        this.sessionPool.emit(events_1.EVENT_SESSION_RETIRED, this);
    }
    /**
     * Increases usage and error count.
     * Should be used when the session has been used unsuccessfully. For example because of timeouts.
     */
    markBad() {
        this._errorScore += 1;
        this._usageCount += 1;
        this._maybeSelfRetire();
    }
    /**
     * Retires the session when `statusCode` is one of the blocked status codes.
     * The blocked codes are the pool's `blockedStatusCodes` (when the pool exposes such an array)
     * combined with `additionalBlockedStatusCodes`.
     * @param statusCode HTTP status code to check.
     * @param additionalBlockedStatusCodes Extra status codes to treat as blocked for this call.
     * @returns `true` if the status code was blocked and the session was retired, otherwise `false`.
     */
    retireOnBlockedStatusCodes(statusCode, additionalBlockedStatusCodes = []) {
        // eslint-disable-next-line dot-notation -- accessing private property
        const poolBlockedStatusCodes = this.sessionPool['blockedStatusCodes'];
        // The constructor only requires an EventEmitter, so the pool may not carry this list.
        const blockedStatusCodes = Array.isArray(poolBlockedStatusCodes) ? poolBlockedStatusCodes : [];
        const isBlocked = blockedStatusCodes
            .concat(additionalBlockedStatusCodes)
            .includes(statusCode);
        if (isBlocked) {
            this.retire();
        }
        return isBlocked;
    }
    /**
     * Saves cookies from an HTTP response to be used with the session.
     * It expects an object with a `headers` property that's either an `Object`
     * (typical Node.js responses) or a `Function` (Puppeteer Response).
     *
     * It then parses and saves the cookies from the `set-cookie` header, if available.
     * Failures are logged and not rethrown.
     */
    setCookiesFromResponse(response) {
        try {
            const cookies = (0, cookie_utils_1.getCookiesFromResponse)(response).filter((c) => c);
            // `response.url` may be a method or a plain value, depending on the response type.
            this._setCookies(cookies, typeof response.url === 'function' ? response.url() : response.url);
        }
        catch (e) {
            const err = e;
            // if invalid Cookie header is provided just log the exception.
            this.log.exception(err, 'Could not get cookies from response');
        }
    }
    /**
     * Saves an array with cookie objects to be used with the session.
     * The objects should be in the format that
     * [Puppeteer uses](https://pptr.dev/#?product=Puppeteer&version=v2.0.0&show=api-pagecookiesurls),
     * but you can also use this function to set cookies manually:
     *
     * ```
     * [
     * { name: 'cookie1', value: 'my-cookie' },
     * { name: 'cookie2', value: 'your-cookie' }
     * ]
     * ```
     */
    setCookies(cookies, url) {
        const normalizedCookies = cookies.map((c) => (0, cookie_utils_1.browserPoolCookieToToughCookie)(c, this.maxAgeSecs));
        this._setCookies(normalizedCookies, url);
    }
    /**
     * Returns cookies in a format compatible with puppeteer/playwright and ready to be used with `page.setCookie`.
     * @param url website url. Only cookies stored for this url will be returned
     */
    getCookies(url) {
        const cookies = this.cookieJar.getCookiesSync(url);
        return cookies.map((c) => (0, cookie_utils_1.toughCookieToBrowserPoolCookie)(c));
    }
    /**
     * Returns cookies saved with the session in the typical
     * key1=value1; key2=value2 format, ready to be used in
     * a cookie header or elsewhere.
     * @returns Represents `Cookie` header.
     */
    getCookieString(url) {
        return this.cookieJar.getCookieStringSync(url, {});
    }
    /**
     * Sets a cookie within this session for the specific URL.
     */
    setCookie(rawCookie, url) {
        this.cookieJar.setCookieSync(rawCookie, url);
    }
    /**
     * Stores each cookie in the jar for the given URL.
     * A cookie that cannot be stored does not stop the rest; the error messages are
     * collected and logged once at debug level.
     */
    _setCookies(cookies, url) {
        const errorMessages = [];
        for (const cookie of cookies) {
            try {
                this.cookieJar.setCookieSync(cookie, url, { ignoreError: false });
            }
            catch (e) {
                const err = e;
                errorMessages.push(err.message);
            }
        }
        // if invalid cookies are provided just log the exception. No need to retry the request automatically.
        if (errorMessages.length) {
            this.log.debug('Could not set cookies.', { errorMessages });
        }
    }
    /**
     * Retires the session if it is no longer usable.
     */
    _maybeSelfRetire() {
        if (!this.isUsable()) {
            this.retire();
        }
    }
}
exports.Session = Session;
//# sourceMappingURL=session.js.map