
@clickup/ent-framework


A PostgreSQL graph-database-alike library with microsharding and row-level security

Island.d.ts
import type { PickPartial } from "../internal/misc";
import type { Client, ClientRole } from "./Client";
import type { LocalCache } from "./LocalCache";
import type { SwallowedErrorLoggerProps } from "./Loggers";
import type { Shard } from "./Shard";

/**
 * Options for Island constructor.
 */
export interface IslandOptions<TClient extends Client> {
  /** Island number. */
  no: number;
  /** Clients of that Island (the order is arbitrary). */
  clients: readonly TClient[];
  /** Should return a Memoize'd Shard object by its number. */
  createShard: (no: number) => Shard<TClient>;
  /** An auxiliary LocalCache used to infer a fallback master/replica role in
   * case some Client is unavailable right now. */
  localCache?: LocalCache<{
    address: string;
    role: ClientRole;
  }> | null;
  /** If nonzero, runs a second shardNos() call attempt on a Client if the 1st
   * call on that Client gets stuck for longer than the provided number of ms.
   *
   * This option is used to detect an unhealthy DB connection quicker, and
   * thus, exit from rediscover() faster (the Shards map can likely still be
   * loaded from a replica, so a down DB is not the end of the world). The idea
   * is that the 1st shardNos() call could get stuck due to the load balancer
   * trying to wait until the DB goes back up again (for PgBouncer, that is the
   * query_wait_timeout situation: "pause_client" is printed to PgBouncer debug
   * logs, and then the Client gets frozen for up to query_wait_timeout; other
   * engines may behave similarly). But for NEW connections/queries, after a
   * small delay, the load balancer may realize that the DB is really down (the
   * load balancer typically gets "connection refused" very quickly while
   * connecting to the DB server), so the 2nd shardNos() call will reject
   * almost immediately (the "fast fail" workflow), way before the 1st call
   * rejects (for PgBouncer with query_wait_timeout=15s, the 1st call may get
   * stuck for up to 15 seconds!). So, we will not wait that long to figure out
   * that the DB is down, and will detect that situation quicker.
   *
   * Typically, the connection attempt from the load balancer to an unhealthy
   * DB server ends quickly with a "connection refused" TCP error (e.g. when
   * the load balancer and the DB server run on the same host), so the value in
   * this option can be small. But not always. Sometimes, the new connection
   * from the load balancer to the DB server gets stuck in the "connecting..."
   * state (e.g. this happens when the load balancer runs in a Docker
   * container, and the DB container gets killed; the connection attempt will
   * eventually fail, but in 1+ minutes and with a "no route to host" error).
   * In this case, the value in the option must be greater than e.g.
   * server_connect_timeout (a PgBouncer example; server_connect_timeout is
   * basically PgBouncer's tool to detect "stuck" connection attempts, i.e.
   * connections which don't get "connection refused" quickly). */
  shardNosConcurrentRetryDelayMs?: number;
}
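For orientation, here is a minimal construction sketch (illustrative; not part of Island.d.ts). It assumes that Island, Client, and Shard are re-exported from the package root, and uses ambient declarations as hypothetical stand-ins for whatever your connection layer actually provides:

import { Island, type Client, type Shard } from "@clickup/ent-framework";

// Hypothetical pre-built Clients of one Island (e.g. a PG master + a replica).
declare const masterClient: Client;
declare const replicaClient: Client;
// Hypothetical memoized factory: must return the same Shard object per number.
declare const createShard: (no: number) => Shard<Client>;

const island = new Island({
  no: 0,
  clients: [masterClient, replicaClient], // order is arbitrary
  createShard,
  // Illustrative value: run a concurrent 2nd shardNos() attempt if the 1st
  // one is stuck for 3 seconds, to fail fast when the DB is really down.
  shardNosConcurrentRetryDelayMs: 3000,
});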
/**
 * Island is a moderately short-lived collection of DB connections (represented
 * as Clients) that contains a single master Client and any number of replicas.
 *
 * - In normal situations, you likely don't need to work with Islands directly:
 *   you can rely on higher level abstractions which support automatic
 *   rediscovery and retries: Ent (or, lower level, Shard and Schema).
 * - Islands are helpful mostly when working with cross-Shards logic.
 * - Island is somewhat temporary: if the Cluster is reconfigured in real-time,
 *   then its Island objects may be recycled and re-created, and the
 *   corresponding Clients may be ended. This also applies to any given Client
 *   instance. Don't retain and reuse those objects for too long. The reliable
 *   abstractions (resilient to disconnects, Shards migration, failover etc.)
 *   start at the Shard level.
 * - There is no guarantee that the data returned by shards(), master() or
 *   replica() will be up to date. Shards may have just been migrated to
 *   another Island. A master may become a replica, or vice versa.
 */
export declare class Island<TClient extends Client> {
  /** Default values for the constructor options. */
  static readonly DEFAULT_OPTIONS: Required<PickPartial<IslandOptions<Client>>>;
  /** Clients grouped based on their roles and health. */
  private classifiedClients;
  /** In case shardNos discovery hasn't succeeded for some Client yet, and
   * thus, we are not sure about that Client's role, we try to load the role
   * from the fallback cache into this map and use it instead of "unknown". */
  private fallbackRoles;
  /** Recently discovered Shard numbers. */
  private shardNos;
  /** Island configuration options. */
  readonly options: Required<IslandOptions<TClient>>;
  /**
   * Initializes the Island by copying the Client references into it.
   */
  constructor(options: IslandOptions<TClient>);
  /**
   * Island number.
   */
  get no(): number;
  /**
   * The list of Clients in this Island. No assumptions about the order.
   */
  get clients(): readonly TClient[];
  /**
   * Queries for Shards on the best available Client (preferably master, then
   * replicas) and stores the result internally, available to further shards()
   * calls.
   * - If some Clients are unavailable, tries its best to infer the data from
   *   other Clients.
   * - The method queries ALL Clients in parallel, because the caller logic
   *   needs to know who's master and who's replica anyway, as a side effect of
   *   the very 1st query after the Client creation. We infer that as a
   *   piggyback of calling Client#shardNos().
   * - In case we could not discover Shards, returns the list of errors that
   *   happened during the discovery.
   */
  rediscover(): Promise<SwallowedErrorLoggerProps[]>;
  /**
   * Returns the currently best-known Shards on this Island. This method is
   * needed only when working with cross-Shards logic; in normal situations, it
   * is not called much.
   */
  shards(): Array<Shard<TClient>>;
  /**
   * Returns the currently best-known master Client among the Clients of this
   * Island.
   *
   * - If all masters are unhealthy, we still return one of them and prefer not
   *   to fall back on a replica, because otherwise, we'd see non-obvious
   *   errors in logs ("can't write in a read-only Client" or so) and suspect
   *   a bug in the logic, although there is really no bug: the master node
   *   simply went down. It's way better to throw a straightforward error like
   *   "Client is down".
   * - If we can't find a master, but there is a list of Clients with unknown
   *   roles, prefer returning one of them over any known replica, since there
   *   is a chance that among those unknown Clients, there will be a master.
   * - In case all Clients are read-only (replicas), still returns the 1st of
   *   them, assuming that it's better to throw on a failed write at the caller
   *   side (at worst) rather than here. It is not common for an Island to have
   *   no master Client; that happens only temporarily during a
   *   failover/switchover, so the caller will likely rediscover and find a new
   *   master on the next retry.
   */
  master(): TClient;
  /**
   * Returns a currently best-known random replica Client. In case there are no
   * replicas, returns the master Client.
   */
  replica(): TClient;
  /**
   * Updates the list of classified Clients. We try hard not to put Clients in
   * the "unknown" group, falling back to fallbackRoles.
   */
  private reclassifyClients;
  /**
   * Tries to pull shardNos() out of the Client, failing fast if the DB is
   * down. See details in the shardNosConcurrentRetryDelayMs option
   * description.
   */
  private clientShardNos;
}
//# sourceMappingURL=Island.d.ts.map
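And a separate sketch of the discovery flow these methods describe (again illustrative; not part of Island.d.ts, and under the same assumption that the abstract types are re-exported from the package root). The helper's name and the logging calls are hypothetical:

import type { Client, Island } from "@clickup/ent-framework";

// Hypothetical helper: refresh the Shards map, then pick Clients by role.
async function pickIslandClients(
  island: Island<Client>,
): Promise<{ writer: Client; reader: Client }> {
  // rediscover() swallows per-Client errors and returns their descriptions.
  const errors = await island.rediscover();
  for (const error of errors) {
    console.log("discovery error:", error); // or route to your own logger
  }

  // Cross-Shards logic would iterate island.shards() here; for regular
  // reads/writes, prefer the Shard/Ent abstractions, which rediscover and
  // retry automatically.
  console.log("shards on this island:", island.shards().length);

  return {
    writer: island.master(), // may be unhealthy: a write will then throw clearly
    reader: island.replica(), // falls back to master if there are no replicas
  };
}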