@crawlee/core
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
459 lines • 18 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.SessionPool = void 0;
const tslib_1 = require("tslib");
const node_events_1 = require("node:events");
const async_queue_1 = require("@sapphire/async-queue");
const ow_1 = tslib_1.__importDefault(require("ow"));
const configuration_1 = require("../configuration");
const log_1 = require("../log");
const key_value_store_1 = require("../storages/key_value_store");
const consts_1 = require("./consts");
const session_1 = require("./session");
/**
* Handles the rotation, creation and persistence of user-like sessions.
* Creates a pool of {@link Session} instances, that are randomly rotated.
* When some session is marked as blocked, it is removed and new one is created instead (the pool never returns an unusable session).
* Learn more in the {@doclink guides/session-management | Session management guide}.
*
* You can create one by calling the {@link SessionPool.open} function.
*
* Session pool is already integrated into crawlers, and it can significantly improve your scraper
* performance with just 2 lines of code.
*
* **Example usage:**
*
* ```javascript
* const crawler = new CheerioCrawler({
* useSessionPool: true,
* persistCookiesPerSession: true,
* // ...
* })
* ```
*
* You can configure the pool with many options. See the {@link SessionPoolOptions}.
* Session pool is by default persisted in default {@link KeyValueStore}.
* If you want to have one pool for all runs you have to specify
* {@link SessionPoolOptions.persistStateKeyValueStoreId}.
*
* **Advanced usage:**
*
* ```javascript
* const sessionPool = await SessionPool.open({
* maxPoolSize: 25,
* sessionOptions:{
* maxAgeSecs: 10,
* maxUsageCount: 150, // for example when you know that the site blocks after 150 requests.
* },
* persistStateKeyValueStoreId: 'my-key-value-store-for-sessions',
* persistStateKey: 'my-session-pool',
* });
*
* // Get random session from the pool
* const session1 = await sessionPool.getSession();
* const session2 = await sessionPool.getSession();
* const session3 = await sessionPool.getSession();
*
* // Now you can mark the session either failed or successful
*
* // Marks session as bad after unsuccessful usage -> it increases error count (soft retire)
* session1.markBad()
*
* // Marks as successful.
* session2.markGood()
*
* // Retires session -> session is removed from the pool
* session3.retire()
*
* ```
*
* **Default session allocation flow:*
* 1. Until the `SessionPool` reaches `maxPoolSize`, new sessions are created, provided to the user and added to the pool
* 2. Blocked/retired sessions stay in the pool but are never provided to the user
* 3. Once the pool is full (live plus blocked session count reaches `maxPoolSize`), a random session from the pool is provided.
* 4. If a blocked session would be picked, instead all blocked sessions are evicted from the pool and a new session is created and provided
*
* @category Scaling
*/
class SessionPool extends node_events_1.EventEmitter {
/**
* @internal
*/
constructor(options = {}, config = configuration_1.Configuration.getGlobalConfig()) {
super();
Object.defineProperty(this, "config", {
enumerable: true,
configurable: true,
writable: true,
value: config
});
Object.defineProperty(this, "log", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "maxPoolSize", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "createSessionFunction", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "keyValueStore", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "sessions", {
enumerable: true,
configurable: true,
writable: true,
value: []
});
Object.defineProperty(this, "sessionMap", {
enumerable: true,
configurable: true,
writable: true,
value: new Map()
});
Object.defineProperty(this, "sessionOptions", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "persistStateKeyValueStoreId", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "persistStateKey", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "_listener", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "events", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "blockedStatusCodes", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "persistenceOptions", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "isInitialized", {
enumerable: true,
configurable: true,
writable: true,
value: false
});
Object.defineProperty(this, "queue", {
enumerable: true,
configurable: true,
writable: true,
value: new async_queue_1.AsyncQueue()
});
(0, ow_1.default)(options, ow_1.default.object.exactShape({
maxPoolSize: ow_1.default.optional.number,
persistStateKeyValueStoreId: ow_1.default.optional.string,
persistStateKey: ow_1.default.optional.string,
createSessionFunction: ow_1.default.optional.function,
sessionOptions: ow_1.default.optional.object,
blockedStatusCodes: ow_1.default.optional.array.ofType(ow_1.default.number),
log: ow_1.default.optional.object,
persistenceOptions: ow_1.default.optional.object,
}));
const { maxPoolSize = consts_1.MAX_POOL_SIZE, persistStateKeyValueStoreId, persistStateKey = consts_1.PERSIST_STATE_KEY, createSessionFunction, sessionOptions = {}, blockedStatusCodes = consts_1.BLOCKED_STATUS_CODES, log = log_1.log, persistenceOptions = {
enable: true,
}, } = options;
this.config = config;
this.blockedStatusCodes = blockedStatusCodes;
this.events = config.getEventManager();
this.log = log.child({ prefix: 'SessionPool' });
this.persistenceOptions = persistenceOptions;
// Pool Configuration
this.maxPoolSize = maxPoolSize;
this.createSessionFunction = createSessionFunction || this._defaultCreateSessionFunction;
// Session configuration
this.sessionOptions = {
...sessionOptions,
// the log needs to propagate to createSessionFunction as in "new Session({ ...sessionPool.sessionOptions })"
// and can't go inside _defaultCreateSessionFunction
log: this.log,
};
// Session keyValueStore
this.persistStateKeyValueStoreId = persistStateKeyValueStoreId;
this.persistStateKey = persistStateKey;
}
/**
* Gets count of usable sessions in the pool.
*/
get usableSessionsCount() {
return this.sessions.filter((session) => session.isUsable()).length;
}
/**
* Gets count of retired sessions in the pool.
*/
get retiredSessionsCount() {
return this.sessions.filter((session) => !session.isUsable()).length;
}
/**
* Starts periodic state persistence and potentially loads SessionPool state from {@link KeyValueStore}.
* It is called automatically by the {@link SessionPool.open} function.
*/
async initialize() {
if (this.isInitialized) {
return;
}
this.keyValueStore = await key_value_store_1.KeyValueStore.open(this.persistStateKeyValueStoreId, { config: this.config });
if (!this.persistenceOptions.enable) {
this.isInitialized = true;
return;
}
if (!this.persistStateKeyValueStoreId) {
this.log.debug(`No 'persistStateKeyValueStoreId' options specified, this session pool's data has been saved in the KeyValueStore with the id: ${this.keyValueStore.id}`);
}
// in case of migration happened and SessionPool state should be restored from the keyValueStore.
await this._maybeLoadSessionPool();
this._listener = this.persistState.bind(this);
this.events.on("persistState" /* EventType.PERSIST_STATE */, this._listener);
this.isInitialized = true;
}
/**
* Adds a new session to the session pool. The pool automatically creates sessions up to the maximum size of the pool,
* but this allows you to add more sessions once the max pool size is reached.
* This also allows you to add session with overridden session options (e.g. with specific session id).
* @param [options] The configuration options for the session being added to the session pool.
*/
async addSession(options = {}) {
this._throwIfNotInitialized();
const { id } = options;
if (id) {
const sessionExists = this.sessionMap.has(id);
if (sessionExists) {
throw new Error(`Cannot add session with id '${id}' as it already exists in the pool`);
}
}
if (!this._hasSpaceForSession()) {
this._removeRetiredSessions();
}
const newSession = options instanceof session_1.Session ? options : await this.createSessionFunction(this, { sessionOptions: options });
this.log.debug(`Adding new Session - ${newSession.id}`);
this._addSession(newSession);
}
/**
* Gets session.
* If there is space for new session, it creates and returns new session.
* If the session pool is full, it picks a session from the pool,
* If the picked session is usable it is returned, otherwise it creates and returns a new one.
* @param [sessionId] If provided, it returns the usable session with this id, `undefined` otherwise.
*/
async getSession(sessionId) {
await this.queue.wait();
try {
this._throwIfNotInitialized();
if (sessionId) {
const session = this.sessionMap.get(sessionId);
if (session && session.isUsable())
return session;
return undefined;
}
if (this._hasSpaceForSession()) {
return await this._createSession();
}
const pickedSession = this._pickSession();
if (pickedSession.isUsable()) {
return pickedSession;
}
this._removeRetiredSessions();
return await this._createSession();
}
finally {
this.queue.shift();
}
}
/**
* @param options - Override the persistence options provided in the constructor
*/
async resetStore(options) {
if (!this.persistenceOptions.enable && !options?.enable) {
return;
}
await this.keyValueStore?.setValue(this.persistStateKey, null);
}
/**
* Returns an object representing the internal state of the `SessionPool` instance.
* Note that the object's fields can change in future releases.
*/
getState() {
return {
usableSessionsCount: this.usableSessionsCount,
retiredSessionsCount: this.retiredSessionsCount,
sessions: this.sessions.map((session) => session.getState()),
};
}
/**
* Persists the current state of the `SessionPool` into the default {@link KeyValueStore}.
* The state is persisted automatically in regular intervals.
* @param options - Override the persistence options provided in the constructor
*/
async persistState(options) {
if (!this.persistenceOptions.enable && !options?.enable) {
return;
}
this.log.debug('Persisting state', {
persistStateKeyValueStoreId: this.persistStateKeyValueStoreId,
persistStateKey: this.persistStateKey,
});
// use half the interval of `persistState` to avoid race conditions
const persistStateIntervalMillis = this.config.get('persistStateIntervalMillis');
const timeoutSecs = persistStateIntervalMillis / 2000;
await this.keyValueStore
.setValue(this.persistStateKey, this.getState(), {
timeoutSecs,
doNotRetryTimeouts: true,
})
.catch((error) => this.log.warning(`Failed to persist the session pool stats to ${this.persistStateKey}`, { error }));
}
/**
* Removes listener from `persistState` event.
* This function should be called after you are done with using the `SessionPool` instance.
*/
async teardown() {
this.events.off("persistState" /* EventType.PERSIST_STATE */, this._listener);
await this.persistState();
}
/**
* SessionPool should not work before initialization.
*/
_throwIfNotInitialized() {
if (!this.isInitialized)
throw new Error('SessionPool is not initialized.');
}
/**
* Removes retired `Session` instances from `SessionPool`.
*/
_removeRetiredSessions() {
this.sessions = this.sessions.filter((storedSession) => {
if (storedSession.isUsable())
return true;
this.sessionMap.delete(storedSession.id);
this.log.debug(`Removed Session - ${storedSession.id}`);
return false;
});
}
/**
* Adds `Session` instance to `SessionPool`.
* @param newSession `Session` instance to be added.
*/
_addSession(newSession) {
this.sessions.push(newSession);
this.sessionMap.set(newSession.id, newSession);
}
/**
* Gets random index.
*/
_getRandomIndex() {
return Math.floor(Math.random() * this.sessions.length);
}
/**
* Creates new session without any extra behavior.
* @param sessionPool
* @param [options]
* @param [options.sessionOptions] The configuration options for the session being created.
* @returns New session.
*/
_defaultCreateSessionFunction(sessionPool, options = {}) {
(0, ow_1.default)(options, ow_1.default.object.exactShape({ sessionOptions: ow_1.default.optional.object }));
const { sessionOptions = {} } = options;
return new session_1.Session({
...this.sessionOptions,
...sessionOptions,
sessionPool,
});
}
/**
* Creates new session and adds it to the pool.
* @returns Newly created `Session` instance.
*/
async _createSession() {
const newSession = await this.createSessionFunction(this);
this._addSession(newSession);
this.log.debug(`Created new Session - ${newSession.id}`);
return newSession;
}
/**
* Decides whether there is enough space for creating new session.
*/
_hasSpaceForSession() {
return this.sessions.length < this.maxPoolSize;
}
/**
* Picks random session from the `SessionPool`.
* @returns Picked `Session`.
*/
_pickSession() {
return this.sessions[this._getRandomIndex()]; // Or maybe we should let the developer to customize the picking algorithm
}
/**
* Potentially loads `SessionPool`.
* If the state was persisted it loads the `SessionPool` from the persisted state.
*/
async _maybeLoadSessionPool() {
const loadedSessionPool = await this.keyValueStore.getValue(this.persistStateKey);
if (!loadedSessionPool)
return;
// Invalidate old sessions and load active sessions only
this.log.debug('Recreating state from KeyValueStore', {
persistStateKeyValueStoreId: this.persistStateKeyValueStoreId,
persistStateKey: this.persistStateKey,
});
for (const sessionObject of loadedSessionPool.sessions) {
sessionObject.sessionPool = this;
sessionObject.createdAt = new Date(sessionObject.createdAt);
sessionObject.expiresAt = new Date(sessionObject.expiresAt);
const recreatedSession = await this.createSessionFunction(this, { sessionOptions: sessionObject });
if (recreatedSession.isUsable()) {
this._addSession(recreatedSession);
}
}
this.log.debug(`${this.usableSessionsCount} active sessions loaded from KeyValueStore`);
}
/**
* Opens a SessionPool and returns a promise resolving to an instance
* of the {@link SessionPool} class that is already initialized.
*
* For more details and code examples, see the {@link SessionPool} class.
*/
static async open(options, config) {
const sessionPool = new SessionPool(options, config);
await sessionPool.initialize();
return sessionPool;
}
}
exports.SessionPool = SessionPool;
//# sourceMappingURL=session_pool.js.map