UNPKG

@lodestar/beacon-node

Version:

A Typescript implementation of the beacon chain

628 lines (558 loc) • 23.6 kB
import type {MessageStreamDirection, PeerId} from "@libp2p/interface";
import {BitArray} from "@chainsafe/ssz";
import {ChainConfig} from "@lodestar/config";
import {ATTESTATION_SUBNET_COUNT, SYNC_COMMITTEE_SUBNET_COUNT} from "@lodestar/params";
import {CustodyIndex, Status, SubnetID, altair, phase0} from "@lodestar/types";
import {MapDef} from "@lodestar/utils";
import {shuffle} from "../../../util/shuffle.js";
import {sortBy} from "../../../util/sortBy.js";
import {NetworkCoreMetrics} from "../../core/metrics.js";
import {RequestedSubnet} from "./subnetMap.js";

/** Target number of peers we'd like to have connected to a given long-lived subnet */
const TARGET_SUBNET_PEERS = 6;

/**
 * This is for non-sampling groups only. This is a very easy number to achieve given an average of 6.25 peers per column subnet on public networks.
 * This is needed to always maintain some minimum peers on all subnets so that when we publish a block, we're sure we publish to all column subnets.
 */
const TARGET_GROUP_PEERS_PER_SUBNET = 4;

/**
 * This is used in the pruning logic. We avoid pruning peers on sync-committees if doing so would
 * lower our peer count below this number. Instead we favour a non-uniform distribution of subnet
 * peers.
 */
const MIN_SYNC_COMMITTEE_PEERS = 2;

/**
 * Lighthouse has this value as 0. However, as monitored in Lodestar mainnet node, the max score is 0
 * and average score is -0.5 to 0 so we want this value to be a little bit more relaxed
 */
const LOW_SCORE_TO_PRUNE_IF_TOO_MANY_PEERS = -2;

/**
 * Instead of attempting to connect the exact amount necessary this will overshoot a little since the success
 * rate of outgoing connections is low, <33%. If we try to connect exactly `targetPeers - connectedPeerCount` the
 * peer count will almost always be just below targetPeers triggering constant discoveries that are not necessary
 */
const PEERS_TO_CONNECT_OVERSHOOT_FACTOR = 3;

/**
 * Keep at least 10% of outbound peers.
 * For rationale, see https://github.com/ChainSafe/lodestar/issues/2215
 */
const OUTBOUND_PEERS_RATIO = 0.1;

/** Default (all-zero) attnets bitfield used for peers whose attnets are unknown */
const attnetsZero = BitArray.fromBitLen(ATTESTATION_SUBNET_COUNT);
/** Default (all-zero) syncnets bitfield used for peers whose syncnets are unknown */
const syncnetsZero = BitArray.fromBitLen(SYNC_COMMITTEE_SUBNET_COUNT);

type SubnetDiscvQuery = {subnet: SubnetID; toSlot: number; maxPeersToDiscover: number};

/**
 * A map of das custody group index to maxPeersToDiscover
 */
export type CustodyGroupQueries = Map<CustodyIndex, number>;

/**
 * Comparison of our status vs a peer's status.
 *
 * The main usage of this score is to feed into peer prioritization during syncing, and especially when the node is having trouble finding data during syncing
 *
 * For network stability, we DON'T distinguish peers that are far behind us vs peers that are close to us.
 */
enum StatusScore {
  /** The peer is close to our chain */
  CLOSE_TO_US = -1,
  /** The peer is far ahead of chain */
  FAR_AHEAD = 0,
}

/**
 * In practice, this score only tracks if the peer is far ahead of us or not during syncing.
 * When the node is synced, the peer is always CLOSE_TO_US.
 *
 * @param ours - our own status
 * @param theirs - the peer's status, or null if unknown (treated as CLOSE_TO_US)
 * @param opts - only `starvationThresholdSlots` is read here
 * @returns FAR_AHEAD if the peer's finalized epoch is greater than ours, or its head slot is more than
 * `starvationThresholdSlots` ahead of ours; CLOSE_TO_US otherwise
 */
function computeStatusScore(ours: Status, theirs: Status | null, opts: PrioritizePeersOpts): StatusScore {
  if (theirs === null) {
    return StatusScore.CLOSE_TO_US;
  }

  if (theirs.finalizedEpoch > ours.finalizedEpoch) {
    return StatusScore.FAR_AHEAD;
  }

  if (theirs.headSlot > ours.headSlot + opts.starvationThresholdSlots) {
    return StatusScore.FAR_AHEAD;
  }

  // It's dangerous to downscore peers that are far behind.
  // This means we'd be more likely to disconnect peers that are attempting to sync, which would affect network stability.
  // if (ours.headSlot > theirs.headSlot + opts.starvationThresholdSlots) {
  //   return StatusScore.FAR_BEHIND;
  // }

  return StatusScore.CLOSE_TO_US;
}

type PeerInfo = {
  id: PeerId;
  direction: MessageStreamDirection | null;
  statusScore: StatusScore;
  attnets: phase0.AttestationSubnets;
  syncnets: altair.SyncSubnets;
  samplingGroups: CustodyIndex[];
  attnetsTrueBitIndices: number[];
  syncnetsTrueBitIndices: number[];
  score: number;
};

export type PrioritizePeersOpts = {
  targetPeers: number;
  maxPeers: number;
  targetGroupPeers: number;
  status: Status;
  starved: boolean;
  starvationPruneRatio: number;
  starvationThresholdSlots: number;
  outboundPeersRatio?: number;
  targetSubnetPeers?: number;
};

export enum ExcessPeerDisconnectReason {
  LOW_SCORE = "low_score",
  NO_LONG_LIVED_SUBNET = "no_long_lived_subnet",
  TOO_GROUPED_SUBNET = "too_grouped_subnet",
  FIND_BETTER_PEERS = "find_better_peers",
}

/**
 * Prioritize which peers to disconnect and which to connect. Conditions:
 * - Reach `targetPeers`
 * - If we're starved for data, prune additional peers
 * - Don't exceed `maxPeers`
 * - Ensure there are enough peers per column subnets, attestation subnets and sync committee subnets
 * - Prioritize peers with good score
 *
 * pre-fulu samplingGroups is not used and this function returns empty custodyGroupQueries
 *
 * @returns the number of additional peers to dial (never negative), the peers to disconnect grouped by
 * reason, and the discovery queries for attnets, syncnets and custody groups
 */
export function prioritizePeers(
  connectedPeersInfo: {
    id: PeerId;
    direction: MessageStreamDirection | null;
    status: Status | null;
    attnets: phase0.AttestationSubnets | null;
    syncnets: altair.SyncSubnets | null;
    samplingGroups: CustodyIndex[] | null;
    score: number;
  }[],
  activeAttnets: RequestedSubnet[],
  activeSyncnets: RequestedSubnet[],
  samplingGroups: CustodyIndex[] | undefined,
  opts: PrioritizePeersOpts,
  config: ChainConfig,
  metrics: NetworkCoreMetrics | null
): {
  peersToConnect: number;
  peersToDisconnect: Map<ExcessPeerDisconnectReason, PeerId[]>;
  attnetQueries: SubnetDiscvQuery[];
  syncnetQueries: SubnetDiscvQuery[];
  custodyGroupQueries: CustodyGroupQueries;
} {
  const {targetPeers, maxPeers} = opts;

  let peersToConnect = 0;
  const peersToDisconnect = new MapDef<ExcessPeerDisconnectReason, PeerId[]>(() => []);

  // Pre-compute trueBitIndexes for re-use below. Set null subnets Maps to default zero value
  const connectedPeers = connectedPeersInfo.map(
    (peer): PeerInfo => ({
      id: peer.id,
      direction: peer.direction,
      statusScore: computeStatusScore(opts.status, peer.status, opts),
      attnets: peer.attnets ?? attnetsZero,
      syncnets: peer.syncnets ?? syncnetsZero,
      samplingGroups: peer.samplingGroups ?? [],
      attnetsTrueBitIndices: peer.attnets?.getTrueBitIndexes() ?? [],
      syncnetsTrueBitIndices: peer.syncnets?.getTrueBitIndexes() ?? [],
      score: peer.score,
    })
  );

  const {attnetQueries, syncnetQueries, custodyGroupQueries, dutiesByPeer} = requestSubnetPeers(
    connectedPeers,
    activeAttnets,
    activeSyncnets,
    samplingGroups,
    opts,
    config,
    metrics
  );

  const connectedPeerCount = connectedPeers.length;

  if (connectedPeerCount < targetPeers) {
    // Need more peers.
    // Instead of attempting to connect the exact amount necessary this will overshoot a little since the success
    // rate of outgoing connections is low, <33%. If we try to connect exactly `targetPeers - connectedPeerCount` the
    // peer count will almost always be just below targetPeers triggering constant discoveries that are not necessary
    peersToConnect = Math.max(
      // Guard against a negative result when `maxPeers` is configured below the current peer count
      0,
      Math.min(
        PEERS_TO_CONNECT_OVERSHOOT_FACTOR * (targetPeers - connectedPeerCount),
        // Never attempt to connect more peers than maxPeers even considering a low chance of dial success
        maxPeers - connectedPeerCount
      )
    );
  } else if (connectedPeerCount > targetPeers) {
    pruneExcessPeers(connectedPeers, dutiesByPeer, activeAttnets, peersToDisconnect, opts);
  }

  return {
    peersToConnect,
    peersToDisconnect,
    attnetQueries,
    syncnetQueries,
    custodyGroupQueries,
  };
}

/**
 * If more peers are needed in attnets and syncnets and column subnets, create SubnetDiscvQuery for each subnet
 * pre-fulu samplingGroups is not used and this function returns empty custodyGroupQueries
 *
 * Also returns `dutiesByPeer`: for each connected peer, the number of active attnets + syncnets it is
 * subscribed to. Entries are only written when there is at least one active attnet or syncnet.
 */
function requestSubnetPeers(
  connectedPeers: PeerInfo[],
  activeAttnets: RequestedSubnet[],
  activeSyncnets: RequestedSubnet[],
  ourSamplingGroups: CustodyIndex[] | undefined,
  opts: PrioritizePeersOpts,
  config: ChainConfig,
  metrics: NetworkCoreMetrics | null
): {
  attnetQueries: SubnetDiscvQuery[];
  syncnetQueries: SubnetDiscvQuery[];
  custodyGroupQueries: CustodyGroupQueries;
  dutiesByPeer: Map<PeerInfo, number>;
} {
  const {targetSubnetPeers = TARGET_SUBNET_PEERS} = opts;
  const attnetQueries: SubnetDiscvQuery[] = [];
  const syncnetQueries: SubnetDiscvQuery[] = [];

  // To filter out peers containing enough attnets of interest from possible disconnection
  const dutiesByPeer = new Map<PeerInfo, number>();

  // attnets, do we need queries for more peers
  if (activeAttnets.length > 0) {
    /** Count of connected peers per active subnet, a peer may be counted on multiple subnets */
    const peersPerSubnet = new Map<number, number>();

    for (const peer of connectedPeers) {
      const trueBitIndices = peer.attnetsTrueBitIndices;
      let dutyCount = 0;
      for (const {subnet} of activeAttnets) {
        if (trueBitIndices.includes(subnet)) {
          dutyCount += 1;
          peersPerSubnet.set(subnet, 1 + (peersPerSubnet.get(subnet) ?? 0));
        }
      }
      dutiesByPeer.set(peer, dutyCount);
    }

    for (const {subnet, toSlot} of activeAttnets) {
      const peersInSubnet = peersPerSubnet.get(subnet) ?? 0;
      if (peersInSubnet < targetSubnetPeers) {
        // We need more peers
        attnetQueries.push({subnet, toSlot, maxPeersToDiscover: targetSubnetPeers - peersInSubnet});
      }
    }
  }

  // syncnets, do we need queries for more peers
  if (activeSyncnets.length > 0) {
    /** Count of connected peers per active subnet, a peer may be counted on multiple subnets */
    const peersPerSubnet = new Map<number, number>();

    for (const peer of connectedPeers) {
      const trueBitIndices = peer.syncnetsTrueBitIndices;
      // Accumulate on top of the attnet duties counted above, if any
      let dutyCount = dutiesByPeer.get(peer) ?? 0;
      for (const {subnet} of activeSyncnets) {
        if (trueBitIndices.includes(subnet)) {
          dutyCount += 1;
          peersPerSubnet.set(subnet, 1 + (peersPerSubnet.get(subnet) ?? 0));
        }
      }
      dutiesByPeer.set(peer, dutyCount);
    }

    for (const {subnet, toSlot} of activeSyncnets) {
      const peersInSubnet = peersPerSubnet.get(subnet) ?? 0;
      if (peersInSubnet < targetSubnetPeers) {
        // We need more peers
        syncnetQueries.push({subnet, toSlot, maxPeersToDiscover: targetSubnetPeers - peersInSubnet});
      }
    }
  }

  const custodyGroupQueries: CustodyGroupQueries = new Map();

  // pre-fulu
  if (ourSamplingGroups == null) {
    return {attnetQueries, syncnetQueries, custodyGroupQueries, dutiesByPeer};
  }

  // column subnets, do we need queries for more peers
  const targetGroupPeersPerSamplingGroup = opts.targetGroupPeers;
  const peersPerGroup = new Map<CustodyIndex, number>();
  for (const peer of connectedPeers) {
    const peerSamplingGroups = peer.samplingGroups;
    for (const group of peerSamplingGroups) {
      peersPerGroup.set(group, 1 + (peersPerGroup.get(group) ?? 0));
    }
  }

  const ourSamplingGroupSet = new Set(ourSamplingGroups);
  for (let groupIndex = 0; groupIndex < config.NUMBER_OF_CUSTODY_GROUPS; groupIndex++) {
    const peersInGroup = peersPerGroup.get(groupIndex) ?? 0;
    metrics?.peerCountPerSamplingGroup.set({groupIndex}, peersInGroup);
    // Groups we sample use the configured target, all other groups use the lower fixed minimum
    const targetGroupPeers = ourSamplingGroupSet.has(groupIndex)
      ? targetGroupPeersPerSamplingGroup
      : TARGET_GROUP_PEERS_PER_SUBNET;
    if (peersInGroup < targetGroupPeers) {
      // We need more peers
      custodyGroupQueries.set(groupIndex, targetGroupPeers - peersInGroup);
    }
  }

  return {attnetQueries, syncnetQueries, custodyGroupQueries, dutiesByPeer};
}

/**
 * Remove excess peers back down to our target values.
 * 1. Remove peers that are not subscribed to a subnet (they have less value)
 * 2. Remove worst scoring peers
 * 3. Remove peers that we have many on any particular subnet
 *   - Only consider removing peers on subnet that has > TARGET_SUBNET_PEERS to be safe
 *   - If we have a choice, do not remove peer that would drop us below targetPeersPerAttnetSubnet
 *   - If we have a choice, do not remove peer that would drop us below MIN_SYNC_COMMITTEE_PEERS
 * 4. If still above target, remove remaining peers in pruning order
 *
 * Although the logic looks complicated, we'd prune 5 peers max per heartbeat based on the mainnet config.
 *
 * Results are written into `peersToDisconnect`, keyed by disconnect reason.
 */
function pruneExcessPeers(
  connectedPeers: PeerInfo[],
  dutiesByPeer: Map<PeerInfo, number>,
  activeAttnets: RequestedSubnet[],
  peersToDisconnect: MapDef<ExcessPeerDisconnectReason, PeerId[]>,
  opts: PrioritizePeersOpts
): void {
  const {targetPeers, targetSubnetPeers = TARGET_SUBNET_PEERS, outboundPeersRatio = OUTBOUND_PEERS_RATIO} = opts;
  const connectedPeerCount = connectedPeers.length;
  const outboundPeersTarget = Math.round(outboundPeersRatio * connectedPeerCount);

  // Count outbound peers
  let outboundPeers = 0;
  for (const peer of connectedPeers) {
    if (peer.direction === "outbound") {
      outboundPeers++;
    }
  }

  let outboundPeersEligibleForPruning = 0;

  const sortedPeers = sortPeersToPrune(connectedPeers, dutiesByPeer);

  const peersEligibleForPruning = sortedPeers
    // Iterate in pruning order (most prunable first) doing a manual filter for duties and outbound ratio
    .filter((peer) => {
      // Peers with duties are not eligible for pruning
      if ((dutiesByPeer.get(peer) ?? 0) > 0) {
        return false;
      }

      // Peers far ahead when we're starved for data are not eligible for pruning
      if (opts.starved && peer.statusScore === StatusScore.FAR_AHEAD) {
        return false;
      }

      // Outbound peers are only eligible while more than `outboundPeersTarget` outbound peers would remain,
      // the rest (later in pruning order) are kept
      if (peer.direction === "outbound") {
        if (outboundPeers - outboundPeersEligibleForPruning > outboundPeersTarget) {
          outboundPeersEligibleForPruning++;
        } else {
          return false;
        }
      }

      return true;
    });

  let peersToDisconnectCount = 0;
  // Ids already selected for disconnection in any step, for O(1) membership checks.
  // PeerId is compared by identity, same as the `Array.includes` checks this replaces.
  const selectedPeerIds = new Set<PeerId>();

  const noLongLivedSubnetPeersToDisconnect: PeerId[] = [];

  const peersToDisconnectTarget =
    // if we're starved for data, prune additional peers
    connectedPeerCount - targetPeers + (opts.starved ? targetPeers * opts.starvationPruneRatio : 0);

  // 1. Lodestar prefers disconnecting peers that does not have long lived subnets
  // See https://github.com/ChainSafe/lodestar/issues/3940
  // peers with low score will be disconnected through heartbeat in the end
  for (const peer of peersEligibleForPruning) {
    const hasLongLivedSubnet = peer.attnetsTrueBitIndices.length > 0 || peer.syncnetsTrueBitIndices.length > 0;
    if (!hasLongLivedSubnet && peersToDisconnectCount < peersToDisconnectTarget) {
      noLongLivedSubnetPeersToDisconnect.push(peer.id);
      selectedPeerIds.add(peer.id);
      peersToDisconnectCount++;
    }
  }
  peersToDisconnect.set(ExcessPeerDisconnectReason.NO_LONG_LIVED_SUBNET, noLongLivedSubnetPeersToDisconnect);

  // 2. Disconnect peers that have score < LOW_SCORE_TO_PRUNE_IF_TOO_MANY_PEERS
  const badScorePeersToDisconnect: PeerId[] = [];
  for (const peer of peersEligibleForPruning) {
    if (
      peer.score < LOW_SCORE_TO_PRUNE_IF_TOO_MANY_PEERS &&
      peersToDisconnectCount < peersToDisconnectTarget &&
      !selectedPeerIds.has(peer.id)
    ) {
      badScorePeersToDisconnect.push(peer.id);
      selectedPeerIds.add(peer.id);
      peersToDisconnectCount++;
    }
  }
  peersToDisconnect.set(ExcessPeerDisconnectReason.LOW_SCORE, badScorePeersToDisconnect);

  // 3. Disconnect peers that are too grouped on any given subnet
  const tooGroupedPeersToDisconnect: PeerId[] = [];
  if (peersToDisconnectCount < peersToDisconnectTarget) {
    // PeerInfo array by attestation subnet
    const subnetToPeers = new MapDef<number, PeerInfo[]>(() => []);
    // number of peers per long lived sync committee
    const syncCommitteePeerCount = new MapDef<number, number>(() => 0);

    // populate the above variables, skipping peers already selected in steps 1 and 2
    for (const peer of connectedPeers) {
      if (selectedPeerIds.has(peer.id)) {
        continue;
      }
      for (const subnet of peer.attnetsTrueBitIndices) {
        subnetToPeers.getOrDefault(subnet).push(peer);
      }
      for (const subnet of peer.syncnetsTrueBitIndices) {
        syncCommitteePeerCount.set(subnet, 1 + syncCommitteePeerCount.getOrDefault(subnet));
      }
    }

    while (peersToDisconnectCount < peersToDisconnectTarget) {
      const maxPeersSubnet = findMaxPeersSubnet(subnetToPeers, targetSubnetPeers);
      // peers are NOT too grouped on any given subnet, finish this loop
      if (maxPeersSubnet === null) {
        break;
      }

      const peersOnMostGroupedSubnet = subnetToPeers.get(maxPeersSubnet);
      if (peersOnMostGroupedSubnet === undefined) {
        break;
      }

      // Find peers to remove from the current maxPeersSubnet
      const removedPeer = findPeerToRemove(
        subnetToPeers,
        syncCommitteePeerCount,
        peersOnMostGroupedSubnet,
        targetSubnetPeers,
        activeAttnets
      );

      // If we have successfully found a candidate peer to prune, prune it,
      // otherwise all peers on this subnet should not be removed.
      // In this case, we remove all peers from the pruning logic and try another subnet.
      if (removedPeer != null) {
        // recalculate variables
        removePeerFromSubnetToPeers(subnetToPeers, removedPeer);
        decreaseSynccommitteePeerCount(syncCommitteePeerCount, removedPeer.syncnetsTrueBitIndices);

        tooGroupedPeersToDisconnect.push(removedPeer.id);
        selectedPeerIds.add(removedPeer.id);
        peersToDisconnectCount++;
      } else {
        // no peer to remove from the maxPeersSubnet
        // should continue with the 2nd biggest maxPeersSubnet
        subnetToPeers.delete(maxPeersSubnet);
      }
    }

    peersToDisconnect.set(ExcessPeerDisconnectReason.TOO_GROUPED_SUBNET, tooGroupedPeersToDisconnect);

    // 4. Ensure to always to prune to target peers
    // In rare case, all peers may have duties and good score but very low long lived subnet,
    // and not too grouped to any subnets, we need to always disconnect peers until it reaches targetPeers
    // because we want to keep improving peers (long lived subnets + score)
    // otherwise we'll not able to accept new peer connection to consider better peers
    // see https://github.com/ChainSafe/lodestar/issues/5198
    const remainingPeersToDisconnect: PeerId[] = [];
    for (const {id} of sortedPeers) {
      if (peersToDisconnectCount >= peersToDisconnectTarget) {
        break;
      }
      if (selectedPeerIds.has(id)) {
        continue;
      }
      remainingPeersToDisconnect.push(id);
      peersToDisconnectCount++;
    }

    peersToDisconnect.set(ExcessPeerDisconnectReason.FIND_BETTER_PEERS, remainingPeersToDisconnect);
  }
}

/**
 * Sort peers ascending, peer-0 has the most chance to prune, peer-n has the least.
 * Shuffling first to break ties.
 * Sort by number of dutied subnets first, then status score (applicable during syncing), then number of
 * long lived subnets, then peer score.
 * peer score is the last criteria since they are supposed to be in the same score range,
 * bad score peers are removed by peer manager anyway
 */
export function sortPeersToPrune(connectedPeers: PeerInfo[], dutiesByPeer: Map<PeerInfo, number>): PeerInfo[] {
  return shuffle(connectedPeers).sort((p1, p2) => {
    const dutiedSubnet1 = dutiesByPeer.get(p1) ?? 0;
    const dutiedSubnet2 = dutiesByPeer.get(p2) ?? 0;
    if (dutiedSubnet1 === dutiedSubnet2) {
      const statusScore = p1.statusScore - p2.statusScore;
      if (statusScore !== 0) {
        return statusScore;
      }
      const [longLivedSubnets1, longLivedSubnets2] = [p1, p2].map(
        (p) => p.attnetsTrueBitIndices.length + p.syncnetsTrueBitIndices.length
      );
      if (longLivedSubnets1 === longLivedSubnets2) {
        return p1.score - p2.score;
      }
      return longLivedSubnets1 - longLivedSubnets2;
    }
    return dutiedSubnet1 - dutiedSubnet2;
  });
}

/**
 * Find subnet that has the most peers and > TARGET_SUBNET_PEERS, return null if peers are not grouped
 * to any subnets.
 */
function findMaxPeersSubnet(subnetToPeers: Map<number, PeerInfo[]>, targetSubnetPeers: number): SubnetID | null {
  let maxPeersSubnet: SubnetID | null = null;
  let maxPeerCountPerSubnet = -1;

  for (const [subnet, peers] of subnetToPeers) {
    if (peers.length > targetSubnetPeers && peers.length > maxPeerCountPerSubnet) {
      maxPeersSubnet = subnet;
      maxPeerCountPerSubnet = peers.length;
    }
  }

  return maxPeersSubnet;
}

/**
 * Find peers to remove from the current maxPeersSubnet.
 * In the long term, this logic will help us gradually find peers with more long lived subnet.
 * Return null if we should not remove any peer on the most grouped subnet.
 *
 * Candidates are tried in ascending order of their number of subscribed attnets.
 */
function findPeerToRemove(
  subnetToPeers: Map<number, PeerInfo[]>,
  syncCommitteePeerCount: Map<number, number>,
  peersOnMostGroupedSubnet: PeerInfo[],
  targetSubnetPeers: number,
  activeAttnets: RequestedSubnet[]
): PeerInfo | null {
  const peersOnSubnet = sortBy(peersOnMostGroupedSubnet, (peer) => peer.attnetsTrueBitIndices.length);
  // Loop-invariant: the attnets we currently have duties on
  const requestedSubnets = activeAttnets.map((activeAttnet) => activeAttnet.subnet);
  let removedPeer: PeerInfo | null = null;

  for (const candidatePeer of peersOnSubnet) {
    // new logic of lodestar
    const attnetIndices = candidatePeer.attnetsTrueBitIndices;
    if (attnetIndices.length > 0) {
      let minAttnetCount = ATTESTATION_SUBNET_COUNT;
      // intersection of requested subnets and subnets that peer subscribes to
      for (const subnet of requestedSubnets) {
        const numSubnetPeers = subnetToPeers.get(subnet)?.length;
        if (numSubnetPeers !== undefined && numSubnetPeers < minAttnetCount && attnetIndices.includes(subnet)) {
          minAttnetCount = numSubnetPeers;
        }
      }
      // shouldn't remove this peer because it drops us below targetSubnetPeers
      if (minAttnetCount <= targetSubnetPeers) {
        continue;
      }
    }

    // same logic to lighthouse
    const syncnetIndices = candidatePeer.syncnetsTrueBitIndices;
    // The peer is subscribed to some long-lived sync-committees
    if (syncnetIndices.length > 0) {
      const minSubnetCount = Math.min(...syncnetIndices.map((subnet) => syncCommitteePeerCount.get(subnet) ?? 0));
      // If the minimum count is our target or lower, we
      // shouldn't remove this peer, because it drops us lower
      // than our target
      if (minSubnetCount <= MIN_SYNC_COMMITTEE_PEERS) {
        continue;
      }
    }

    // ok, found a peer to remove
    removedPeer = candidatePeer;
    break;
  }

  return removedPeer;
}

/**
 * Remove a peer from subnetToPeers map.
 * Mutates the per-subnet arrays in place; at most one occurrence is removed per subnet.
 */
function removePeerFromSubnetToPeers(subnetToPeers: Map<number, PeerInfo[]>, removedPeer: PeerInfo): void {
  for (const peers of subnetToPeers.values()) {
    const index = peers.indexOf(removedPeer);
    if (index >= 0) {
      peers.splice(index, 1);
    }
  }
}

/**
 * Decrease the syncCommitteePeerCount from the specified committees set
 * Counts are floored at 0. No-op when `committees` is undefined.
 */
function decreaseSynccommitteePeerCount(
  syncCommitteePeerCount: MapDef<number, number>,
  committees: number[] | undefined
): void {
  if (committees) {
    for (const syncCommittee of committees) {
      syncCommitteePeerCount.set(syncCommittee, Math.max(syncCommitteePeerCount.getOrDefault(syncCommittee) - 1, 0));
    }
  }
}