@crawlee/core
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
251 lines • 9.69 kB
TypeScript
import { EventEmitter } from 'node:events';
import type { Log } from '@apify/log';
import { Configuration } from '../configuration';
import type { PersistenceOptions } from '../crawlers/statistics';
import type { EventManager } from '../events/event_manager';
import { KeyValueStore } from '../storages/key_value_store';
import type { SessionOptions } from './session';
import { Session } from './session';
/**
* Factory user-function which creates customized {@link Session} instances.
* It may return the session directly or a promise that resolves to it.
*/
export interface CreateSession {
/**
* @param sessionPool Pool requesting the new session.
* @param options Optional settings for this call.
* @param options.sessionOptions The configuration options for the session being created.
* @returns The created {@link Session}, or a promise resolving to it.
*/
(sessionPool: SessionPool, options?: {
sessionOptions?: SessionOptions;
}): Session | Promise<Session>;
}
/**
* Options accepted by the {@link SessionPool} constructor and by {@link SessionPool.open}.
* Every field is optional.
*/
export interface SessionPoolOptions {
/**
* Maximum size of the pool. Indicates how many sessions are rotated.
* @default 1000
*/
maxPoolSize?: number;
/** The configuration options for {@link Session} instances. */
sessionOptions?: SessionOptions;
/** Name or ID of the `KeyValueStore` where the `SessionPool` state is stored. */
persistStateKeyValueStoreId?: string;
/**
* The session pool persists its state under this key in the key-value store.
* @default SESSION_POOL_STATE
*/
persistStateKey?: string;
/**
* Custom function that should return a `Session` instance.
* Any error thrown from this function will terminate the process.
* The function receives the `SessionPool` instance as a parameter.
*/
createSessionFunction?: CreateSession;
/**
* Specifies which response status codes are considered as blocked.
* Session connected to such request will be marked as retired.
* @default [401, 403, 429]
*/
blockedStatusCodes?: number[];
/** @internal */
log?: Log;
/**
* Control how and when to persist the state of the session pool.
*/
persistenceOptions?: PersistenceOptions;
}
/**
* Handles the rotation, creation and persistence of user-like sessions.
* Creates a pool of {@link Session} instances, that are randomly rotated.
* When some session is marked as blocked, it is removed and new one is created instead (the pool never returns an unusable session).
* Learn more in the {@doclink guides/session-management | Session management guide}.
*
* You can create one by calling the {@link SessionPool.open} function.
*
* Session pool is already integrated into crawlers, and it can significantly improve your scraper
* performance with just 2 lines of code.
*
* **Example usage:**
*
* ```javascript
* const crawler = new CheerioCrawler({
* useSessionPool: true,
* persistCookiesPerSession: true,
* // ...
* })
* ```
*
* You can configure the pool with many options. See the {@link SessionPoolOptions}.
* Session pool is by default persisted in default {@link KeyValueStore}.
* If you want to have one pool for all runs you have to specify
* {@link SessionPoolOptions.persistStateKeyValueStoreId}.
*
* **Advanced usage:**
*
* ```javascript
* const sessionPool = await SessionPool.open({
* maxPoolSize: 25,
* sessionOptions:{
* maxAgeSecs: 10,
* maxUsageCount: 150, // for example when you know that the site blocks after 150 requests.
* },
* persistStateKeyValueStoreId: 'my-key-value-store-for-sessions',
* persistStateKey: 'my-session-pool',
* });
*
* // Get random session from the pool
* const session1 = await sessionPool.getSession();
* const session2 = await sessionPool.getSession();
* const session3 = await sessionPool.getSession();
*
* // Now you can mark the session either failed or successful
*
* // Marks session as bad after unsuccessful usage -> it increases error count (soft retire)
* session1.markBad()
*
* // Marks as successful.
* session2.markGood()
*
* // Retires session -> session is removed from the pool
* session3.retire()
*
* ```
*
* **Default session allocation flow:**
* 1. Until the `SessionPool` reaches `maxPoolSize`, new sessions are created, provided to the user and added to the pool
* 2. Blocked/retired sessions stay in the pool but are never provided to the user
* 3. Once the pool is full (live plus blocked session count reaches `maxPoolSize`), a random session from the pool is provided.
* 4. If a blocked session would be picked, instead all blocked sessions are evicted from the pool and a new session is created and provided
*
* @category Scaling
*/
export declare class SessionPool extends EventEmitter {
/** Configuration associated with this pool; presumably the `config` constructor argument or a default - confirm in the implementation. */
readonly config: Configuration;
/** Logger used by the pool. */
protected log: Log;
/** Maximum size of the pool; see {@link SessionPoolOptions.maxPoolSize}. */
protected maxPoolSize: number;
/** Factory used to create new sessions; see {@link SessionPoolOptions.createSessionFunction}. */
protected createSessionFunction: CreateSession;
/** Key-value store handle; presumably the store the pool state is persisted in - confirm in the implementation. */
protected keyValueStore: KeyValueStore;
/** Sessions currently held by the pool. */
protected sessions: Session[];
/** Sessions keyed by a string; presumably the session id, used for lookup in `getSession(sessionId)` - confirm in the implementation. */
protected sessionMap: Map<string, Session>;
/** The configuration options for {@link Session} instances; see {@link SessionPoolOptions.sessionOptions}. */
protected sessionOptions: SessionOptions;
/** See {@link SessionPoolOptions.persistStateKeyValueStoreId}. */
protected persistStateKeyValueStoreId?: string;
/** See {@link SessionPoolOptions.persistStateKey}. */
protected persistStateKey: string;
/** NOTE(review): presumably the handler registered for the `persistState` event and removed in `teardown()` - confirm. */
protected _listener: () => Promise<void>;
/** Event manager; NOTE(review): presumably the source of the `persistState` event - confirm. */
protected events: EventManager;
/** See {@link SessionPoolOptions.blockedStatusCodes}. */
protected readonly blockedStatusCodes: number[];
/** See {@link SessionPoolOptions.persistenceOptions}. */
protected persistenceOptions: PersistenceOptions;
/** Initialization flag; NOTE(review): presumably set by `initialize()` and checked by `_throwIfNotInitialized()` - confirm. */
protected isInitialized: boolean;
/** Private member; its type is not exposed in this declaration. */
private queue;
/**
* @internal
*/
constructor(options?: SessionPoolOptions, config?: Configuration);
/**
* Gets count of usable sessions in the pool.
*/
get usableSessionsCount(): number;
/**
* Gets count of retired sessions in the pool.
*/
get retiredSessionsCount(): number;
/**
* Starts periodic state persistence and potentially loads SessionPool state from {@link KeyValueStore}.
* It is called automatically by the {@link SessionPool.open} function.
*/
initialize(): Promise<void>;
/**
* Adds a new session to the session pool. The pool automatically creates sessions up to the maximum size of the pool,
* but this allows you to add more sessions once the max pool size is reached.
* This also allows you to add a session with overridden session options (e.g. with a specific session id).
* @param [options] An existing {@link Session} instance, or the configuration options for the session being added to the session pool.
*/
addSession(options?: Session | SessionOptions): Promise<void>;
/**
* Gets a session.
* If there is space for a new session, it creates and returns a new session.
* If the session pool is full, it picks a session from the pool.
* If the picked session is usable it is returned, otherwise it creates and returns a new one.
*/
getSession(): Promise<Session>;
/**
* Gets session based on the provided session id or `undefined`.
*
* NOTE(review): the declared return type is `Promise<Session>` and does not include `undefined`,
* although this description mentions it - confirm against the implementation before relying on either.
* @param sessionId Id of the session to look up.
*/
getSession(sessionId: string): Promise<Session>;
/**
* NOTE(review): the name suggests this resets the persisted store state; the behavior is not visible in this declaration - confirm.
* @param options - Override the persistence options provided in the constructor
*/
resetStore(options?: PersistenceOptions): Promise<void>;
/**
* Returns an object representing the internal state of the `SessionPool` instance.
* Note that the object's fields can change in future releases.
*/
getState(): {
usableSessionsCount: number;
retiredSessionsCount: number;
// @ts-ignore optional peer dependency or compatibility with es2022
sessions: import("./session").SessionState[];
};
/**
* Persists the current state of the `SessionPool` into the default {@link KeyValueStore}.
* The state is persisted automatically in regular intervals.
* @param options - Override the persistence options provided in the constructor
*/
persistState(options?: PersistenceOptions): Promise<void>;
/**
* Removes listener from `persistState` event.
* This function should be called after you are done with using the `SessionPool` instance.
*/
teardown(): Promise<void>;
/**
* SessionPool should not work before initialization.
* NOTE(review): presumably throws when the pool has not been initialized - confirm the error type in the implementation.
*/
protected _throwIfNotInitialized(): void;
/**
* Removes retired `Session` instances from `SessionPool`.
*/
protected _removeRetiredSessions(): void;
/**
* Adds `Session` instance to `SessionPool`.
* @param newSession `Session` instance to be added.
*/
protected _addSession(newSession: Session): void;
/**
* Gets random index.
* NOTE(review): presumably an index into `sessions` - confirm.
*/
protected _getRandomIndex(): number;
/**
* Creates new session without any extra behavior.
* @param sessionPool
* @param [options]
* @param [options.sessionOptions] The configuration options for the session being created.
* @returns New session.
*/
protected _defaultCreateSessionFunction(sessionPool: SessionPool, options?: {
sessionOptions?: SessionOptions;
}): Session;
/**
* Creates new session and adds it to the pool.
* @returns Newly created `Session` instance.
*/
protected _createSession(): Promise<Session>;
/**
* Decides whether there is enough space for creating new session.
*/
protected _hasSpaceForSession(): boolean;
/**
* Picks random session from the `SessionPool`.
* @returns Picked `Session`.
*/
protected _pickSession(): Session;
/**
* Potentially loads `SessionPool`.
* If the state was persisted it loads the `SessionPool` from the persisted state.
*/
protected _maybeLoadSessionPool(): Promise<void>;
/**
* Opens a SessionPool and returns a promise resolving to an instance
* of the {@link SessionPool} class that is already initialized.
*
* For more details and code examples, see the {@link SessionPool} class.
*/
static open(options?: SessionPoolOptions, config?: Configuration): Promise<SessionPool>;
}
//# sourceMappingURL=session_pool.d.ts.map