@lodestar/beacon-node
Version:
A Typescript implementation of the beacon chain
452 lines • 22.1 kB
JavaScript
import path from "node:path";
import { Worker, spawn } from "@chainsafe/threads";
// `threads` library creates self global variable which breaks `timeout-abort-controller` https://github.com/jacobheun/timeout-abort-controller/issues/9
// @ts-ignore
// biome-ignore lint/suspicious/noGlobalAssign: <explanation>
self = undefined;
import { LinkedList } from "../../../util/array.js";
import { callInNextEventLoop } from "../../../util/eventLoop.js";
import { QueueError, QueueErrorCode } from "../../../util/queue/index.js";
import { verifySignatureSetsMaybeBatch } from "../maybeBatch.js";
import { getAggregatedPubkey, getAggregatedPubkeysCount } from "../utils.js";
import { JobQueueItemType, jobItemSameMessageToMultiSet, jobItemSigSets, jobItemWorkReq, } from "./jobItem.js";
import { defaultPoolSize } from "./poolSize.js";
import { WorkResultCode } from "./types.js";
import { chunkifyMaximizeChunkSize } from "./utils.js";
// The Worker constructor resolves the path relative to the current working directory,
// so the directory differs when running under NODE_ENV === "test"
const workerDir = process.env.NODE_ENV === "test" ? "../../../../lib/chain/bls/multithread" : "./";
// Leave 1 thread for the main thread, but always keep at least 1 worker in the pool
const blsPoolSize = Math.max(defaultPoolSize - 1, 1);
/**
 * Split big signature sets into smaller sets so they can be sent to multiple workers.
 *
 * The biggest sets happen during sync: on mainnet, batches of 64 blocks have around ~8000 signatures.
 * The latency cost of sending the job to and from the worker is approximately a single sig verification.
 * If you split a big signature set into 2, the extra time cost is `(2+2N)/(1+2N)`.
 * For 128, the extra time cost is about 0.3%. No specific reasoning for `128`, it's just good enough.
 */
const MAX_SIGNATURE_SETS_PER_JOB = 128;
/**
 * If there are more than `MAX_BUFFERED_SIGS` buffered sigs, verify them immediately without waiting `MAX_BUFFER_WAIT_MS`.
 *
 * The efficiency improvement of batching sets asymptotically reaches x2. However, batching large sets
 * carries more risk in case a signature is invalid, since all sets in the batch must then be revalidated.
 * 32 is the sweet spot for this tradeoff.
 */
const MAX_BUFFERED_SIGS = 32;
/**
 * Gossip objects usually come in bursts. Buffering them for a short period of time allows increasing batching
 * efficiency, at the cost of delaying validation. Unless running in production shows otherwise, it's not critical
 * to hold attestations and aggregates for 100ms. Lodestar's existing queues may hold those objects for much longer anyway.
 *
 * There's no exact reasoning for the `100` milliseconds number. The metric `batchSigsSuccess` should indicate if this
 * value needs revision.
 */
const MAX_BUFFER_WAIT_MS = 100;
/**
 * Upper bound on the number of queued jobs for which `canAcceptWork()` still reports true
 */
const MAX_JOBS_CAN_ACCEPT_WORK = 512;
/**
 * Lifecycle states of a pool worker.
 *
 * Numeric enum with a two-way mapping: `WorkerStatusCode.idle === 3` and
 * `WorkerStatusCode[3] === "idle"`. The numeric value is the position of the
 * label in the list below.
 */
const WorkerStatusCode = {};
for (const [code, label] of [
    "notInitialized",
    "initializing",
    "initializationError",
    "idle",
    "running",
].entries()) {
    WorkerStatusCode[label] = code;
    WorkerStatusCode[code] = label;
}
/**
 * Wraps "threads" library thread pool queue system with the goals:
 * - Complete total outstanding jobs in total minimum time possible.
 *   Will split large signature sets into smaller sets and send to different workers
 * - Reduce the latency cost for small signature sets. In NodeJS 12,14 worker <-> main thread
 *   communication has very high latency, of around ~5 ms. So package multiple small signature
 *   sets into packages of work and send at once to a worker to distribute the latency cost
 *
 * Jobs flow: `queueBlsWork()` -> (optional batching buffer) -> `jobs` queue -> `runJob()` -> worker.
 * After `close()` the pool is marked closed: queued and buffered jobs are rejected with
 * `QUEUE_ABORTED`, `runJob()` becomes a no-op and new submissions are rejected.
 */
export class BlsMultiThreadWorkerPool {
    /**
     * @param options - reads `blsVerifyAllMultiThread` (defaults to false)
     * @param modules - `{ logger, metrics }`; every metrics access is optional-chained, so metrics may be absent
     */
    constructor(options, modules) {
        // Jobs ready to be picked up by an idle worker
        this.jobs = new LinkedList();
        // Batchable jobs waiting for `MAX_BUFFER_WAIT_MS` or `MAX_BUFFERED_SIGS`, null when nothing is buffered
        this.bufferedJobs = null;
        // Set to true by `close()`
        this.closed = false;
        // Number of workers currently in `running` status
        this.workersBusy = 0;
        /**
         * Potentially submit jobs to an idle worker, only if there's a worker and jobs
         */
        this.runJob = async () => {
            if (this.closed) {
                return;
            }
            // Find idle worker
            const worker = this.workers.find((worker) => worker.status.code === WorkerStatusCode.idle);
            if (!worker || worker.status.code !== WorkerStatusCode.idle) {
                return;
            }
            // Prepare work package
            const jobsInput = this.prepareWork();
            if (jobsInput.length === 0) {
                return;
            }
            // TODO: After sending the work to the worker the main thread can drop the job arguments
            // and free-up memory, only needs to keep the job's Promise handlers.
            // Maybe it's not useful since all data referenced in jobs is likely referenced by others
            const workerApi = worker.status.workerApi;
            worker.status = { code: WorkerStatusCode.running, workerApi };
            this.workersBusy++;
            try {
                let startedJobsDefault = 0;
                let startedJobsSameMessage = 0;
                let startedSetsDefault = 0;
                let startedSetsSameMessage = 0;
                const workReqs = [];
                const jobsStarted = [];
                for (const job of jobsInput) {
                    this.metrics?.blsThreadPool.jobWaitTime.observe((Date.now() - job.addedTimeMs) / 1000);
                    let workReq;
                    try {
                        // Note: This can throw, must be handled per-job.
                        // Pubkey and signature aggregation is deferred here
                        workReq = await jobItemWorkReq(job, this.metrics);
                    }
                    catch (e) {
                        this.metrics?.blsThreadPool.errorAggregateSignatureSetsCount.inc({ type: job.type });
                        switch (job.type) {
                            case JobQueueItemType.default:
                                job.reject(e);
                                break;
                            case JobQueueItemType.sameMessage:
                                // there could be an invalid pubkey/signature, retry each individually
                                this.retryJobItemSameMessage(job);
                                break;
                        }
                        continue;
                    }
                    // Re-push all jobs with matching workReq for easier accounting of results
                    workReqs.push(workReq);
                    jobsStarted.push(job);
                    if (job.type === JobQueueItemType.sameMessage) {
                        startedJobsSameMessage += 1;
                        startedSetsSameMessage += job.sets.length;
                    }
                    else {
                        startedJobsDefault += 1;
                        startedSetsDefault += job.sets.length;
                    }
                }
                const startedSigSets = startedSetsDefault + startedSetsSameMessage;
                this.metrics?.blsThreadPool.totalJobsGroupsStarted.inc(1);
                this.metrics?.blsThreadPool.totalJobsStarted.inc({ type: JobQueueItemType.default }, startedJobsDefault);
                this.metrics?.blsThreadPool.totalJobsStarted.inc({ type: JobQueueItemType.sameMessage }, startedJobsSameMessage);
                this.metrics?.blsThreadPool.totalSigSetsStarted.inc({ type: JobQueueItemType.default }, startedSetsDefault);
                this.metrics?.blsThreadPool.totalSigSetsStarted.inc({ type: JobQueueItemType.sameMessage }, startedSetsSameMessage);
                // Send work package to the worker
                // If the job, metrics or any code below throws: the job will reject never going stale.
                // Only downside is the job promise may be resolved twice, but that's not an issue
                const [jobStartSec, jobStartNs] = process.hrtime();
                const workResult = await workerApi.verifyManySignatureSets(workReqs);
                const [jobEndSec, jobEndNs] = process.hrtime();
                const { workerId, batchRetries, batchSigsSuccess, workerStartTime, workerEndTime, results } = workResult;
                const [workerStartSec, workerStartNs] = workerStartTime;
                const [workerEndSec, workerEndNs] = workerEndTime;
                let successCount = 0;
                let errorCount = 0;
                // Un-wrap work package
                for (let i = 0; i < jobsStarted.length; i++) {
                    const job = jobsStarted[i];
                    const jobResult = results[i];
                    const sigSetCount = jobItemSigSets(job);
                    // TODO: enable exhaustive switch case checks lint rule
                    switch (job.type) {
                        case JobQueueItemType.default:
                            if (!jobResult || jobResult.code !== WorkResultCode.success) {
                                job.reject(getJobResultError(jobResult, i));
                                errorCount += sigSetCount;
                            }
                            else {
                                job.resolve(jobResult.result);
                                successCount += sigSetCount;
                            }
                            break;
                        // handle result of the verification of aggregated signature against aggregated pubkeys
                        case JobQueueItemType.sameMessage:
                            if (!jobResult || jobResult.code !== WorkResultCode.success) {
                                job.reject(getJobResultError(jobResult, i));
                                errorCount += 1;
                            }
                            else {
                                if (jobResult.result) {
                                    // All are valid, most of the time it goes here
                                    job.resolve(job.sets.map(() => true));
                                }
                                else {
                                    // Retry each individually
                                    this.retryJobItemSameMessage(job);
                                }
                                successCount += 1;
                            }
                            break;
                    }
                }
                const workerJobTimeSec = workerEndSec - workerStartSec + (workerEndNs - workerStartNs) / 1e9;
                const latencyToWorkerSec = workerStartSec - jobStartSec + (workerStartNs - jobStartNs) / 1e9;
                const latencyFromWorkerSec = jobEndSec - workerEndSec + Number(jobEndNs - workerEndNs) / 1e9;
                // Every job of the package may have failed `jobItemWorkReq` (e.g. a sameMessage job sent to retry);
                // skip the per-set average in that case to avoid observing NaN / Infinity
                if (startedSigSets > 0) {
                    this.metrics?.blsThreadPool.timePerSigSet.observe(workerJobTimeSec / startedSigSets);
                }
                this.metrics?.blsThreadPool.jobsWorkerTime.inc({ workerId }, workerJobTimeSec);
                this.metrics?.blsThreadPool.latencyToWorker.observe(latencyToWorkerSec);
                this.metrics?.blsThreadPool.latencyFromWorker.observe(latencyFromWorkerSec);
                this.metrics?.blsThreadPool.successJobsSignatureSetsCount.inc(successCount);
                this.metrics?.blsThreadPool.errorJobsSignatureSetsCount.inc(errorCount);
                this.metrics?.blsThreadPool.batchRetries.inc(batchRetries);
                this.metrics?.blsThreadPool.batchSigsSuccess.inc(batchSigsSuccess);
            }
            catch (e) {
                // Worker communications should never reject
                if (!this.closed) {
                    this.logger.error("BlsMultiThreadWorkerPool error", {}, e);
                }
                // Reject all
                for (const job of jobsInput) {
                    job.reject(e);
                }
            }
            worker.status = { code: WorkerStatusCode.idle, workerApi };
            this.workersBusy--;
            // Potentially run a new job
            callInNextEventLoop(this.runJob);
        };
        /**
         * Add all buffered jobs to the job queue and potentially run them immediately
         */
        this.runBufferedJobs = () => {
            if (this.bufferedJobs) {
                for (const job of this.bufferedJobs.jobs) {
                    this.jobs.push(job);
                }
                for (const job of this.bufferedJobs.prioritizedJobs) {
                    this.jobs.unshift(job);
                }
                this.bufferedJobs = null;
                callInNextEventLoop(this.runJob);
            }
        };
        const { logger, metrics } = modules;
        this.logger = logger;
        this.metrics = metrics;
        this.blsVerifyAllMultiThread = options.blsVerifyAllMultiThread ?? false;
        // Use compressed for herumi for now.
        // The worker is not able to deserialize from uncompressed
        // `Error: err _wrapDeserialize`
        this.workers = this.createWorkers(blsPoolSize);
        if (metrics) {
            metrics.blsThreadPool.queueLength.addCollect(() => {
                metrics.blsThreadPool.queueLength.set(this.jobs.length);
                metrics.blsThreadPool.workersBusy.set(this.workersBusy);
            });
        }
    }
    /**
     * @returns true while at least one worker is not busy and the job queue is below `MAX_JOBS_CAN_ACCEPT_WORK`
     */
    canAcceptWork() {
        return (this.workersBusy < blsPoolSize &&
            // TODO: Should also bound the jobs queue?
            this.jobs.length < MAX_JOBS_CAN_ACCEPT_WORK);
    }
    /**
     * Verify signature sets, either on the main thread or split into chunks of at most
     * `MAX_SIGNATURE_SETS_PER_JOB` sets queued for the workers.
     *
     * @param sets - signature sets to verify
     * @param opts - reads `priority`, `batchable` and `verifyOnMainThread`
     * @returns on the worker path, true only if every chunk verified as valid
     * @throws Error("Empty results array") when chunking yields no chunks; also rejects if any chunk's job rejects
     */
    async verifySignatureSets(sets, opts = {}) {
        // Pubkeys are aggregated in the main thread regardless if verified in workers or in main thread
        this.metrics?.bls.aggregatedPubkeys.inc(getAggregatedPubkeysCount(sets));
        this.metrics?.blsThreadPool.totalSigSets.inc(sets.length);
        if (opts.priority) {
            this.metrics?.blsThreadPool.prioritizedSigSets.inc(sets.length);
        }
        if (opts.batchable) {
            this.metrics?.blsThreadPool.batchableSigSets.inc(sets.length);
        }
        if (opts.verifyOnMainThread && !this.blsVerifyAllMultiThread) {
            const timer = this.metrics?.blsThreadPool.mainThreadDurationInThreadPool.startTimer();
            try {
                return verifySignatureSetsMaybeBatch(sets.map((set) => ({
                    publicKey: getAggregatedPubkey(set),
                    message: set.signingRoot.valueOf(),
                    signature: set.signature,
                })));
            }
            finally {
                if (timer)
                    timer();
            }
        }
        // Split large array of sets into smaller.
        // Very helpful when syncing finalized, sync may submit +1000 sets so chunkify allows to distribute to many workers
        const results = await Promise.all(chunkifyMaximizeChunkSize(sets, MAX_SIGNATURE_SETS_PER_JOB).map((setsChunk) => new Promise((resolve, reject) => {
            return this.queueBlsWork({
                type: JobQueueItemType.default,
                resolve,
                reject,
                addedTimeMs: Date.now(),
                opts,
                sets: setsChunk,
            });
        })));
        // .every on an empty array returns true
        if (results.length === 0) {
            throw Error("Empty results array");
        }
        return results.every((isValid) => isValid === true);
    }
    /**
     * Verify signature sets of the same message, only supports worker verification.
     *
     * @param sets - signature sets that all sign `message`
     * @param message - the shared message
     * @param opts - forwarded to each queued job
     * @returns the per-chunk job results flattened into a single array
     */
    async verifySignatureSetsSameMessage(sets, message, opts = {}) {
        // chunkify so that it reduce the risk of retrying when there is at least one invalid signature
        const results = await Promise.all(chunkifyMaximizeChunkSize(sets, MAX_SIGNATURE_SETS_PER_JOB).map((setsChunk) => new Promise((resolve, reject) => {
            this.queueBlsWork({
                type: JobQueueItemType.sameMessage,
                resolve,
                reject,
                addedTimeMs: Date.now(),
                opts,
                sets: setsChunk,
                message,
            });
        })));
        return results.flat();
    }
    /**
     * Shut the pool down: mark it closed, reject every pending job with `QUEUE_ABORTED`
     * and terminate all workers. Errors from terminating a worker are logged, not thrown.
     */
    async close() {
        // Mark closed first so `runJob` becomes a no-op and `queueBlsWork` refuses new jobs
        this.closed = true;
        const abortJob = (job) => job.reject(new QueueError({ code: QueueErrorCode.QUEUE_ABORTED }));
        if (this.bufferedJobs) {
            clearTimeout(this.bufferedJobs.timeout);
            // With the timer cleared nothing would ever flush these jobs, so settle them here
            for (const job of this.bufferedJobs.jobs) {
                abortJob(job);
            }
            for (const job of this.bufferedJobs.prioritizedJobs) {
                abortJob(job);
            }
            this.bufferedJobs = null;
        }
        // Abort all jobs
        for (const job of this.jobs) {
            abortJob(job);
        }
        this.jobs.clear();
        // Terminate all workers. await to ensure no workers are left hanging
        await Promise.all(Array.from(this.workers.entries()).map(([id, worker]) => 
        // NOTE: 'threads' has not yet updated types, and NodeJS complains with
        // [DEP0132] DeprecationWarning: Passing a callback to worker.terminate() is deprecated. It returns a Promise instead.
        worker.worker.terminate().catch((e) => {
            this.logger.error("Error terminating worker", { id }, e);
        })));
    }
    /**
     * Create `poolSize` workers and start their initialization.
     *
     * Each descriptor's `status` moves from `initializing` to `idle` (carrying `workerApi`)
     * or to `initializationError` (carrying `error`) once `spawn` settles.
     *
     * @param poolSize - number of workers to create
     * @returns the worker descriptors, returned before initialization completes
     */
    createWorkers(poolSize) {
        const workers = [];
        for (let i = 0; i < poolSize; i++) {
            const workerData = { workerId: i };
            const worker = new Worker(path.join(workerDir, "worker.js"), {
                workerData,
            });
            const workerDescriptor = {
                worker,
                status: { code: WorkerStatusCode.notInitialized },
            };
            workers.push(workerDescriptor);
            // TODO: Consider initializing only when necessary
            const initPromise = spawn(worker, {
                // A Lodestar Node may do very expensive task at start blocking the event loop and causing
                // the initialization to timeout. The number below is big enough to almost disable the timeout
                timeout: 5 * 60 * 1000,
            });
            workerDescriptor.status = { code: WorkerStatusCode.initializing, initPromise };
            initPromise
                .then((workerApi) => {
                workerDescriptor.status = { code: WorkerStatusCode.idle, workerApi };
                // Potentially run jobs that were queued before initialization of the first worker
                setTimeout(this.runJob, 0);
            })
                .catch((error) => {
                workerDescriptor.status = { code: WorkerStatusCode.initializationError, error };
            });
        }
        return workers;
    }
    /**
     * Register BLS work to be done eventually in a worker
     *
     * @param job - job item carrying `opts`, `resolve` and `reject`
     * @throws QueueError(QUEUE_ABORTED) when the pool is closed
     */
    queueBlsWork(job) {
        if (this.closed) {
            throw new QueueError({ code: QueueErrorCode.QUEUE_ABORTED });
        }
        // TODO: Consider if limiting queue size is necessary here.
        // It would be bad to reject signatures because the node is slow.
        // However, if the worker communication broke jobs won't ever finish
        if (this.workers.length > 0 &&
            this.workers[0].status.code === WorkerStatusCode.initializationError &&
            this.workers.every((worker) => worker.status.code === WorkerStatusCode.initializationError)) {
            job.reject(this.workers[0].status.error);
            return;
        }
        // Append batchable sets to `bufferedJobs`, starting a timeout to push them into `jobs`.
        // Do not call `runJob()`, it is called from `runBufferedJobs()`
        if (job.opts.batchable) {
            if (!this.bufferedJobs) {
                this.bufferedJobs = {
                    jobs: new LinkedList(),
                    prioritizedJobs: new LinkedList(),
                    sigCount: 0,
                    firstPush: Date.now(),
                    timeout: setTimeout(this.runBufferedJobs, MAX_BUFFER_WAIT_MS),
                };
            }
            const jobs = job.opts.priority ? this.bufferedJobs.prioritizedJobs : this.bufferedJobs.jobs;
            jobs.push(job);
            this.bufferedJobs.sigCount += jobItemSigSets(job);
            if (this.bufferedJobs.sigCount > MAX_BUFFERED_SIGS) {
                clearTimeout(this.bufferedJobs.timeout);
                this.runBufferedJobs();
            }
        }
        // Push job and schedule to call `runJob` in the next macro event loop cycle.
        // This is useful to allow batching job submitted from a synchronous for loop,
        // and to prevent large stacks since runJob may be called recursively.
        else {
            if (job.opts.priority) {
                this.jobs.unshift(job);
            }
            else {
                this.jobs.push(job);
            }
            callInNextEventLoop(this.runJob);
        }
    }
    /**
     * Grab pending work up to a max number of signatures
     *
     * @returns jobs shifted off the queue; the last job taken may push the total past
     * `MAX_SIGNATURE_SETS_PER_JOB` because the limit is checked before each shift
     */
    prepareWork() {
        const jobs = [];
        let totalSigs = 0;
        while (totalSigs < MAX_SIGNATURE_SETS_PER_JOB) {
            const job = this.jobs.shift();
            if (!job) {
                break;
            }
            jobs.push(job);
            totalSigs += jobItemSigSets(job);
        }
        return jobs;
    }
    /**
     * Re-queue a sameMessage job as the jobs produced by `jobItemSameMessageToMultiSet`,
     * so each set can be verified individually.
     *
     * @param job - the sameMessage job whose aggregate verification failed or could not be prepared
     */
    retryJobItemSameMessage(job) {
        if (this.closed) {
            // Nothing drains the queue after close(); settle the job instead of leaving it pending
            job.reject(new QueueError({ code: QueueErrorCode.QUEUE_ABORTED }));
            return;
        }
        // Create new jobs for each pubkey set, and Promise.all all the results
        for (const j of jobItemSameMessageToMultiSet(job)) {
            if (j.opts.priority) {
                this.jobs.unshift(j);
            }
            else {
                this.jobs.push(j);
            }
        }
        this.metrics?.blsThreadPool.sameMessageRetryJobs.inc(1);
        this.metrics?.blsThreadPool.sameMessageRetrySets.inc(job.sets.length);
    }
    /** For testing */
    async waitTillInitialized() {
        await Promise.all(this.workers.map(async (worker) => {
            if (worker.status.code === WorkerStatusCode.initializing) {
                await worker.status.initPromise;
            }
        }));
    }
}
/**
 * Build the Error used to reject a job whose worker result is missing or not successful.
 *
 * @param jobResult - result entry returned by the worker for this job; may be undefined
 * @param i - index of the job inside the work package, used in the fallback messages
 * @returns an Error carrying the worker's message and, when present, the worker's stack.
 * Never throws, so one malformed result cannot abort processing of the other results.
 */
function getJobResultError(jobResult, i) {
    if (!jobResult) {
        return new Error(`No jobResult for index ${i}`);
    }
    // `error` may be absent on a malformed or unexpected result; fall back to a generic message
    const workerError = new Error(jobResult.error?.message ?? `Unknown worker error for index ${i}`);
    if (jobResult.error?.stack) {
        workerError.stack = jobResult.error.stack;
    }
    return workerError;
}
//# sourceMappingURL=index.js.map