trigger.dev

A Command-Line Interface for Trigger.dev (v3) projects
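
/*
 * Overview (descriptive comment; everything noted here is taken from the code below):
 * this file is the prod worker entrypoint shipped with the trigger.dev CLI. On start it
 * reads its identity from TRIGGER_* environment variables (content hash, project ref,
 * env ID, run ID, deployment ID and version), loads the WorkerManifest from ./index.json,
 * connects to the coordinator via a ZodSocketConnection on the "prod-worker" namespace at
 * COORDINATOR_HOST:COORDINATOR_PORT (default 127.0.0.1:50080), runs task attempts in a
 * child TaskRunProcess, and serves an HTTP control interface on HTTP_SERVER_PORT (or a
 * random port) with /health, /status, /connect, /close, /test, /preStop and /postStart
 * endpoints. For example, assuming HTTP_SERVER_PORT is set in the container:
 *   curl "http://127.0.0.1:$HTTP_SERVER_PORT/status"
 * returns the worker's current status as JSON.
 */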
import { CoordinatorToProdWorkerMessages, PostStartCauses, PreStopCauses, ProdWorkerToCoordinatorMessages, TaskRunErrorCodes, WorkerManifest, } from "@trigger.dev/core/v3";
import { EXIT_CODE_CHILD_NONZERO, ExponentialBackoff, HttpReply, SimpleLogger, getRandomPortNumber, } from "@trigger.dev/core/v3/apps";
import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket";
import { Evt } from "evt";
import { randomUUID } from "node:crypto";
import { readFile } from "node:fs/promises";
import { createServer } from "node:http";
import { setTimeout as timeout } from "node:timers/promises";
import { logger as cliLogger } from "../utilities/logger.js";
import { TaskRunProcess, } from "../executions/taskRunProcess.js";
import { checkpointSafeTimeout, unboundedTimeout } from "@trigger.dev/core/v3/utils/timers";
import { env } from "std-env";
const HTTP_SERVER_PORT = Number(env.HTTP_SERVER_PORT || getRandomPortNumber());
const COORDINATOR_HOST = env.COORDINATOR_HOST || "127.0.0.1";
const COORDINATOR_PORT = Number(env.COORDINATOR_PORT || 50080);
const MACHINE_NAME = env.MACHINE_NAME || "local";
const POD_NAME = env.POD_NAME || "some-pod";
const SHORT_HASH = env.TRIGGER_CONTENT_HASH.slice(0, 9);
const logger = new SimpleLogger(`[${MACHINE_NAME}][${SHORT_HASH}]`);
const defaultBackoff = new ExponentialBackoff("FullJitter", {
    maxRetries: 7,
});
cliLogger.loggerLevel = "debug";
cliLogger.debug("Starting prod worker", { env, });
class ProdWorker {
    workerManifest;
    host;
    contentHash = env.TRIGGER_CONTENT_HASH;
    projectRef = env.TRIGGER_PROJECT_REF;
    envId = env.TRIGGER_ENV_ID;
    runId = env.TRIGGER_RUN_ID;
    deploymentId = env.TRIGGER_DEPLOYMENT_ID;
    deploymentVersion = env.TRIGGER_DEPLOYMENT_VERSION;
    runningInKubernetes = !!env.KUBERNETES_PORT;
    executing = false;
    completed = new Set();
    paused = false;
    attemptFriendlyId;
    attemptNumber;
    nextResumeAfter;
    waitForPostStart = false;
    connectionCount = 0;
    restoreNotification = Evt.create();
    waitForTaskReplay;
    waitForBatchReplay;
    readyForLazyAttemptReplay;
    durationResumeFallback;
    readyForResumeReplay;
    #httpPort;
    #httpServer;
    #coordinatorSocket;
    _taskRunProcess;
    constructor(port, workerManifest, host = "0.0.0.0") {
        this.workerManifest = workerManifest;
        this.host = host;
        process.on("SIGTERM", this.#handleSignal.bind(this, "SIGTERM"));
        this.#coordinatorSocket = this.#createCoordinatorSocket(COORDINATOR_HOST);
        this.#httpPort = port;
        this.#httpServer = this.#createHttpServer();
    }
    async #handleSignal(signal) {
        logger.log("Received signal", { signal });
        if (signal === "SIGTERM") {
            let gracefulExitTimeoutElapsed = false;
            if (this.executing) {
                const terminationGracePeriodSeconds = 60 * 60;
                logger.log("Waiting for attempt to complete before exiting", {
                    terminationGracePeriodSeconds,
                });
                // Wait for termination grace period minus 5s to give cleanup a chance to complete
                await timeout(terminationGracePeriodSeconds * 1000 - 5000);
                gracefulExitTimeoutElapsed = true;
                logger.log("Termination timeout reached, exiting gracefully.");
            }
            else {
                logger.log("Not executing, exiting immediately.");
            }
            await this.#exitGracefully(gracefulExitTimeoutElapsed);
            return;
        }
        logger.log("Unhandled signal", { signal });
    }
    async #exitGracefully(gracefulExitTimeoutElapsed = false, exitCode = 0) {
        if (this._taskRunProcess) {
            this._taskRunProcess.onTaskRunHeartbeat.detach();
            this._taskRunProcess.onWaitForDuration.detach();
            await this._taskRunProcess.kill();
        }
        if (!gracefulExitTimeoutElapsed) {
            // TODO: Maybe add a sensible timeout instead of a conditional to avoid zombies
            process.exit(exitCode);
        }
    }
    async
#reconnectAfterPostStart() { this.waitForPostStart = false; this.#coordinatorSocket.close(); this.connectionCount = 0; let coordinatorHost = COORDINATOR_HOST; try { if (this.runningInKubernetes) { coordinatorHost = (await readFile("/etc/taskinfo/coordinator-host", "utf-8")).replace("\n", ""); logger.log("reconnecting", { coordinatorHost: { fromEnv: COORDINATOR_HOST, fromVolume: coordinatorHost, current: this.#coordinatorSocket.socket.io.opts.hostname, }, }); } } catch (error) { logger.error("taskinfo read error during reconnect", { error: error instanceof Error ? error.message : error, }); } finally { this.#coordinatorSocket = this.#createCoordinatorSocket(coordinatorHost); } } // MARK: TASK WAIT async #handleOnWaitForTask(message, replayIdempotencyKey) { logger.log("onWaitForTask", { message }); if (this.nextResumeAfter) { logger.error("Already waiting for resume, skipping wait for task", { nextResumeAfter: this.nextResumeAfter, }); return; } const waitForTask = await defaultBackoff.execute(async ({ retry }) => { logger.log("Wait for task with backoff", { retry }); if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); throw new ExponentialBackoff.StopRetrying("No attempt ID"); } return await this.#coordinatorSocket.socket.timeout(20_000).emitWithAck("WAIT_FOR_TASK", { version: "v2", friendlyId: message.friendlyId, attemptFriendlyId: this.attemptFriendlyId, }); }); if (!waitForTask.success) { logger.error("Failed to wait for task with backoff", { cause: waitForTask.cause, error: waitForTask.error, }); this.#emitUnrecoverableError("WaitForTaskFailed", `${waitForTask.cause}: ${waitForTask.error}`); return; } const { willCheckpointAndRestore } = waitForTask.result; await this.#prepareForWait("WAIT_FOR_TASK", willCheckpointAndRestore); if (willCheckpointAndRestore) { // We need to replay this on next connection if we don't receive RESUME_AFTER_DEPENDENCY within a reasonable time if (!this.waitForTaskReplay) { this.waitForTaskReplay = { message, attempt: 1, idempotencyKey: randomUUID(), }; } else { if (replayIdempotencyKey && replayIdempotencyKey !== this.waitForTaskReplay.idempotencyKey) { logger.error("wait for task handler called with mismatched idempotency key, won't overwrite replay request"); return; } this.waitForTaskReplay.attempt++; } } } // MARK: BATCH WAIT async #handleOnWaitForBatch(message, replayIdempotencyKey) { logger.log("onWaitForBatch", { message }); if (this.nextResumeAfter) { logger.error("Already waiting for resume, skipping wait for batch", { nextResumeAfter: this.nextResumeAfter, }); return; } const waitForBatch = await defaultBackoff.execute(async ({ retry }) => { logger.log("Wait for batch with backoff", { retry }); if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); throw new ExponentialBackoff.StopRetrying("No attempt ID"); } return await this.#coordinatorSocket.socket.timeout(20_000).emitWithAck("WAIT_FOR_BATCH", { version: "v2", batchFriendlyId: message.batchFriendlyId, runFriendlyIds: message.runFriendlyIds, attemptFriendlyId: this.attemptFriendlyId, }); }); if (!waitForBatch.success) { logger.error("Failed to wait for batch with backoff", { cause: waitForBatch.cause, error: waitForBatch.error, }); this.#emitUnrecoverableError("WaitForBatchFailed", `${waitForBatch.cause}: ${waitForBatch.error}`); return; } const { willCheckpointAndRestore } = waitForBatch.result; await this.#prepareForWait("WAIT_FOR_BATCH", willCheckpointAndRestore); if 
(willCheckpointAndRestore) { // We need to replay this on next connection if we don't receive RESUME_AFTER_DEPENDENCY within a reasonable time if (!this.waitForBatchReplay) { this.waitForBatchReplay = { message, attempt: 1, idempotencyKey: randomUUID(), }; } else { if (replayIdempotencyKey && replayIdempotencyKey !== this.waitForBatchReplay.idempotencyKey) { logger.error("wait for task handler called with mismatched idempotency key, won't overwrite replay request"); return; } this.waitForBatchReplay.attempt++; } } } async #prepareForWait(reason, willCheckpointAndRestore) { logger.log(`prepare for ${reason}`, { willCheckpointAndRestore }); if (this.nextResumeAfter) { logger.error("Already waiting for resume, skipping prepare for wait", { nextResumeAfter: this.nextResumeAfter, params: { reason, willCheckpointAndRestore, }, }); return; } if (!willCheckpointAndRestore) { return; } this.paused = true; this.nextResumeAfter = reason; this.waitForPostStart = true; await this.#prepareForCheckpoint(); } // MARK: RETRY PREP async #prepareForRetry() { // Clear state for retrying this.paused = false; this.nextResumeAfter = undefined; this.waitForPostStart = false; this.executing = false; this.attemptFriendlyId = undefined; this.attemptNumber = undefined; // Clear replay state this.waitForTaskReplay = undefined; this.waitForBatchReplay = undefined; this.readyForLazyAttemptReplay = undefined; this.durationResumeFallback = undefined; this.readyForResumeReplay = undefined; } // MARK: CHECKPOINT PREP async #prepareForCheckpoint(flush = true) { if (flush) { // Flush before checkpointing so we don't flush the same spans again after restore try { await this._taskRunProcess?.cleanup(false); } catch (error) { logger.error("Failed to flush telemetry while preparing for checkpoint, will proceed anyway", { error }); } } try { // Kill the previous worker process to prevent large checkpoints // TODO: do we need this? 
// await this.#backgroundWorker.forceKillOldTaskRunProcesses(); } catch (error) { logger.error("Failed to kill previous worker while preparing for checkpoint, will proceed anyway", { error }); } this.#readyForCheckpoint(); } #resumeAfterDuration() { this.paused = false; this.nextResumeAfter = undefined; this.waitForPostStart = false; this.durationResumeFallback = undefined; this.readyForResumeReplay = undefined; this._taskRunProcess?.waitCompletedNotification(); } async #readyForLazyAttempt() { const idempotencyKey = randomUUID(); logger.log("ready for lazy attempt", { idempotencyKey }); this.readyForLazyAttemptReplay = { idempotencyKey, }; // Retry if we don't receive EXECUTE_TASK_RUN_LAZY_ATTEMPT in a reasonable time // ..but we also have to be fast to avoid failing the task due to missing heartbeat for await (const { delay, retry } of defaultBackoff.min(10).maxRetries(7)) { if (retry > 0) { logger.log("retrying ready for lazy attempt", { retry, idempotencyKey }); } this.#coordinatorSocket.socket.emit("READY_FOR_LAZY_ATTEMPT", { version: "v1", runId: this.runId, totalCompletions: this.completed.size, }); await timeout(delay.milliseconds); if (!this.readyForLazyAttemptReplay) { logger.log("replay ready for lazy attempt cancelled, discarding", { idempotencyKey, }); return; } if (idempotencyKey !== this.readyForLazyAttemptReplay.idempotencyKey) { logger.log("replay ready for lazy attempt idempotency key mismatch, discarding", { idempotencyKey, newIdempotencyKey: this.readyForLazyAttemptReplay.idempotencyKey, }); return; } } // Fail the task with a more descriptive message as it likely failed with a generic missing heartbeat error this.#failRun(this.runId, "Failed to receive execute request in a reasonable time"); } async #readyForResume() { const idempotencyKey = randomUUID(); logger.log("readyForResume()", { nextResumeAfter: this.nextResumeAfter, attemptFriendlyId: this.attemptFriendlyId, attemptNumber: this.attemptNumber, idempotencyKey, }); if (!this.nextResumeAfter) { logger.error("Missing next resume reason", { status: this.#status }); this.#emitUnrecoverableError("NoNextResume", "Next resume reason not set while resuming from paused state"); return; } if (!this.attemptFriendlyId) { logger.error("Missing attempt friendly ID", { status: this.#status }); this.#emitUnrecoverableError("NoAttemptId", "Attempt ID not set while resuming from paused state"); return; } if (!this.attemptNumber) { logger.error("Missing attempt number", { status: this.#status }); this.#emitUnrecoverableError("NoAttemptNumber", "Attempt number not set while resuming from paused state"); return; } this.readyForResumeReplay = { idempotencyKey, type: this.nextResumeAfter, }; const lockedMetadata = { attemptFriendlyId: this.attemptFriendlyId, attemptNumber: this.attemptNumber, type: this.nextResumeAfter, }; // Retry if we don't receive RESUME_AFTER_DEPENDENCY or RESUME_AFTER_DURATION in a reasonable time // ..but we also have to be fast to avoid failing the task due to missing heartbeat for await (const { delay, retry } of defaultBackoff.min(10).maxRetries(7)) { if (retry > 0) { logger.log("retrying ready for resume", { retry, idempotencyKey }); } this.#coordinatorSocket.socket.emit("READY_FOR_RESUME", { version: "v2", ...lockedMetadata, }); await timeout(delay.milliseconds); if (!this.readyForResumeReplay) { logger.log("replay ready for resume cancelled, discarding", { idempotencyKey, }); return; } if (idempotencyKey !== this.readyForResumeReplay.idempotencyKey) { logger.log("replay ready for resume idempotency key 
mismatch, discarding", { idempotencyKey, newIdempotencyKey: this.readyForResumeReplay.idempotencyKey, }); return; } } } #readyForCheckpoint() { this.#coordinatorSocket.socket.emit("READY_FOR_CHECKPOINT", { version: "v1" }); } #failRun(anyRunId, error) { logger.error("Failing run", { anyRunId, error }); const completion = { ok: false, id: anyRunId, retry: undefined, error: error instanceof Error ? { type: "BUILT_IN_ERROR", name: error.name, message: error.message, stackTrace: error.stack ?? "", } : { type: "BUILT_IN_ERROR", name: "UnknownError", message: String(error), stackTrace: "", }, }; this.#coordinatorSocket.socket.emit("TASK_RUN_FAILED_TO_RUN", { version: "v1", completion, }); } // MARK: ATTEMPT COMPLETION async #submitAttemptCompletion(execution, completion, replayIdempotencyKey) { const taskRunCompleted = await defaultBackoff.execute(async ({ retry }) => { logger.log("Submit attempt completion with backoff", { retry }); return await this.#coordinatorSocket.socket .timeout(20_000) .emitWithAck("TASK_RUN_COMPLETED", { version: "v2", execution, completion, }); }); if (!taskRunCompleted.success) { logger.error("Failed to complete lazy attempt with backoff", { cause: taskRunCompleted.cause, error: taskRunCompleted.error, }); this.#failRun(execution.run.id, taskRunCompleted.error); return; } const { willCheckpointAndRestore, shouldExit } = taskRunCompleted.result; logger.log("completion acknowledged", { willCheckpointAndRestore, shouldExit }); const isNonZeroExitError = !completion.ok && completion.error.type === "INTERNAL_ERROR" && completion.error.code === TaskRunErrorCodes.TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE; const exitCode = isNonZeroExitError ? EXIT_CODE_CHILD_NONZERO : 0; if (shouldExit) { // Exit after completion, without any retrying await this.#exitGracefully(false, exitCode); } else { // We aren't exiting, so we need to prepare for the next attempt await this.#prepareForRetry(); } if (willCheckpointAndRestore) { logger.error("This worker should never be checkpointed between attempts. 
This is a bug."); } } #returnValidatedExtraHeaders(headers) { for (const [key, value] of Object.entries(headers)) { if (value === undefined) { throw new Error(`Extra header is undefined: ${key}`); } } return headers; } // MARK: COORDINATOR SOCKET #createCoordinatorSocket(host) { const extraHeaders = this.#returnValidatedExtraHeaders({ "x-machine-name": MACHINE_NAME, "x-pod-name": POD_NAME, "x-trigger-content-hash": this.contentHash, "x-trigger-project-ref": this.projectRef, "x-trigger-env-id": this.envId, "x-trigger-deployment-id": this.deploymentId, "x-trigger-run-id": this.runId, "x-trigger-deployment-version": this.deploymentVersion, }); if (this.attemptFriendlyId) { extraHeaders["x-trigger-attempt-friendly-id"] = this.attemptFriendlyId; } if (this.attemptNumber !== undefined) { extraHeaders["x-trigger-attempt-number"] = String(this.attemptNumber); } logger.log(`connecting to coordinator: ${host}:${COORDINATOR_PORT}`); logger.debug(`connecting with extra headers`, { extraHeaders }); const coordinatorConnection = new ZodSocketConnection({ namespace: "prod-worker", host, port: COORDINATOR_PORT, clientMessages: ProdWorkerToCoordinatorMessages, serverMessages: CoordinatorToProdWorkerMessages, extraHeaders, ioOptions: { reconnectionDelay: 1000, reconnectionDelayMax: 3000, }, handlers: { RESUME_AFTER_DEPENDENCY: async ({ attemptId, completions }) => { logger.log("Handling RESUME_AFTER_DEPENDENCY", { attemptId, completions: completions.map((c) => ({ id: c.id, ok: c.ok, })), }); if (!this.paused) { logger.error("Failed to resume after dependency: Worker not paused"); return; } if (completions.length === 0) { logger.error("Failed to resume after dependency: No completions"); return; } if (this.nextResumeAfter !== "WAIT_FOR_TASK" && this.nextResumeAfter !== "WAIT_FOR_BATCH") { logger.error("Failed to resume after dependency: Invalid next resume", { nextResumeAfter: this.nextResumeAfter, }); return; } if (this.nextResumeAfter === "WAIT_FOR_TASK" && completions.length > 1) { logger.error("Failed to resume after dependency: Waiting for single task but got multiple completions", { completions: completions, }); return; } const firstCompletion = completions[0]; if (!firstCompletion) { logger.error("Failed to resume after dependency: No first completion", { completions, waitForTaskReplay: this.waitForTaskReplay, nextResumeAfter: this.nextResumeAfter, }); return; } switch (this.nextResumeAfter) { case "WAIT_FOR_TASK": { if (this.waitForTaskReplay) { if (this.waitForTaskReplay.message.friendlyId !== firstCompletion.id) { logger.error("Failed to resume after dependency: Task friendlyId mismatch", { completions, waitForTaskReplay: this.waitForTaskReplay, }); return; } } else { // Only log here so we don't break any existing behavior logger.debug("No waitForTaskReplay", { completions }); } this.waitForTaskReplay = undefined; break; } case "WAIT_FOR_BATCH": { if (this.waitForBatchReplay) { if (!this.waitForBatchReplay.message.runFriendlyIds.includes(firstCompletion.id)) { logger.error("Failed to resume after dependency: Batch friendlyId mismatch", { completions, waitForBatchReplay: this.waitForBatchReplay, }); return; } } else { // Only log here so we don't break any existing behavior logger.debug("No waitForBatchReplay", { completions }); } this.waitForBatchReplay = undefined; break; } } this.paused = false; this.nextResumeAfter = undefined; this.waitForPostStart = false; this.readyForResumeReplay = undefined; for (let i = 0; i < completions.length; i++) { const completion = completions[i]; if (!completion) 
continue; this._taskRunProcess?.taskRunCompletedNotification(completion); } }, RESUME_AFTER_DURATION: async (message) => { if (!this.paused) { logger.error("worker not paused", { attemptId: message.attemptId, }); return; } if (this.nextResumeAfter !== "WAIT_FOR_DURATION") { logger.error("not waiting to resume after duration", { nextResumeAfter: this.nextResumeAfter, }); return; } this.#resumeAfterDuration(); }, EXECUTE_TASK_RUN: async () => { // These messages should only be received by old workers that don't support lazy attempts this.#failRun(this.runId, "Received deprecated EXECUTE_TASK_RUN message. Please contact us if you see this error."); }, EXECUTE_TASK_RUN_LAZY_ATTEMPT: async (message) => { this.readyForLazyAttemptReplay = undefined; if (this.executing) { logger.error("dropping execute request, already executing"); return; } const attemptCount = message.lazyPayload.attemptCount ?? 0; logger.log("execute attempt counts", { attemptCount, completed: this.completed.size }); if (this.completed.size > 0 && this.completed.size >= attemptCount + 1) { logger.error("dropping execute request, already completed"); return; } this.executing = true; const createAttempt = await defaultBackoff.execute(async ({ retry }) => { logger.log("Create task run attempt with backoff", { retry, runId: message.lazyPayload.runId, }); return await this.#coordinatorSocket.socket .timeout(15_000) .emitWithAck("CREATE_TASK_RUN_ATTEMPT", { version: "v1", runId: message.lazyPayload.runId, }); }); logger.log("create attempt", { createAttempt }); if (!createAttempt.success) { this.#failRun(message.lazyPayload.runId, `Failed to create attempt: ${createAttempt.cause}. ${createAttempt.error}`); return; } if (!createAttempt.result.success) { this.#failRun(message.lazyPayload.runId, createAttempt.result.reason ?? 
"Failed to create attempt"); return; } await this.#killCurrentTaskRunProcessBeforeAttempt(); this.attemptFriendlyId = createAttempt.result.executionPayload.execution.attempt.id; this.attemptNumber = createAttempt.result.executionPayload.execution.attempt.number; const { execution } = createAttempt.result.executionPayload; const { environment } = message.lazyPayload; const env = { ...gatherProcessEnv(), ...environment, }; this._taskRunProcess = new TaskRunProcess({ workerManifest: this.workerManifest, env, serverWorker: execution.worker, payload: createAttempt.result.executionPayload, messageId: message.lazyPayload.messageId, }); this._taskRunProcess.onTaskRunHeartbeat.attach((heartbeatId) => { logger.log("onTaskRunHeartbeat", { heartbeatId, }); this.#coordinatorSocket.socket.volatile.emit("TASK_RUN_HEARTBEAT", { version: "v1", runId: heartbeatId, }); }); this._taskRunProcess.onWaitForDuration.attach(this.#handleOnWaitForDuration.bind(this)); this._taskRunProcess.onWaitForTask.attach(this.#handleOnWaitForTask.bind(this)); this._taskRunProcess.onWaitForBatch.attach(this.#handleOnWaitForBatch.bind(this)); logger.log("initializing task run process", { workerManifest: this.workerManifest, attemptId: execution.attempt.id, runId: execution.run.id, }); try { await this._taskRunProcess.initialize(); logger.log("executing task run process", { attemptId: execution.attempt.id, runId: execution.run.id, }); const completion = await this._taskRunProcess.execute(); logger.log("completed", completion); this.completed.add(execution.attempt.id); try { await this._taskRunProcess.cleanup(true); } catch (error) { logger.error("Failed to cleanup task run process, submitting completion anyway", { error, }); } await this.#submitAttemptCompletion(execution, completion); } catch (error) { logger.error("Failed to complete lazy attempt", { error, }); try { await this.#submitAttemptCompletion(execution, { id: execution.run.id, ok: false, retry: undefined, error: TaskRunProcess.parseExecuteError(error, !this.runningInKubernetes), }); } catch (error) { this.#failRun(message.lazyPayload.runId, error); } } }, REQUEST_ATTEMPT_CANCELLATION: async (message) => { if (!this.executing) { logger.log("dropping cancel request, not executing", { status: this.#status }); return; } logger.log("cancelling attempt", { attemptId: message.attemptId, status: this.#status }); await this._taskRunProcess?.cancel(); }, REQUEST_EXIT: async (message) => { if (message.version === "v2" && message.delayInMs) { logger.log("exit requested with delay", { delayInMs: message.delayInMs }); await timeout(message.delayInMs); } this.#coordinatorSocket.close(); process.exit(0); }, READY_FOR_RETRY: async (message) => { if (this.completed.size < 1) { logger.error("Received READY_FOR_RETRY but no completions yet. This is a bug."); return; } await this.#readyForLazyAttempt(); }, }, // MARK: ON CONNECTION onConnection: async (socket, handler, sender, logger) => { logger.log("connected to coordinator", { status: this.#status, connectionCount: ++this.connectionCount, }); // We need to send our current state to the coordinator socket.emit("SET_STATE", { version: "v1", attemptFriendlyId: this.attemptFriendlyId, attemptNumber: this.attemptNumber ? 
String(this.attemptNumber) : undefined, }); try { if (this.waitForPostStart) { logger.log("skip connection handler, waiting for post start hook"); return; } if (this.paused) { await this.#readyForResume(); return; } if (this.executing) { return; } process.removeAllListeners("uncaughtException"); process.on("uncaughtException", (error) => { console.error("Uncaught exception during run", error); this.#failRun(this.runId, error); }); await this.#readyForLazyAttempt(); } catch (error) { logger.error("connection handler error", { error }); } finally { if (this.connectionCount === 1) { // Skip replays if this is the first connection, including post start return; } // This is a reconnect, so handle replays this.#handleReplays(); } }, onError: async (socket, err, logger) => { logger.error("onError", { error: { name: err.name, message: err.message, }, }); }, }); return coordinatorConnection; } // MARK: Handle onWaitForDuration async #handleOnWaitForDuration(message) { logger.log("onWaitForDuration", { ...message, drift: Date.now() - message.now, }); if (this.nextResumeAfter) { logger.error("Already waiting for resume, skipping wait for duration", { nextResumeAfter: this.nextResumeAfter, }); return; } noResume: { const { ms, waitThresholdInMs } = message; const internalTimeout = unboundedTimeout(ms, "internal"); const checkpointSafeInternalTimeout = checkpointSafeTimeout(ms); if (ms < waitThresholdInMs) { await internalTimeout; break noResume; } const waitForDuration = await defaultBackoff.execute(async ({ retry }) => { logger.log("Wait for duration with backoff", { retry }); if (!this.attemptFriendlyId) { logger.error("Failed to send wait message, attempt friendly ID not set", { message }); throw new ExponentialBackoff.StopRetrying("No attempt ID"); } return await this.#coordinatorSocket.socket .timeout(20_000) .emitWithAck("WAIT_FOR_DURATION", { ...message, attemptFriendlyId: this.attemptFriendlyId, }); }); if (!waitForDuration.success) { logger.error("Failed to wait for duration with backoff", { cause: waitForDuration.cause, error: waitForDuration.error, }); this.#emitUnrecoverableError("WaitForDurationFailed", `${waitForDuration.cause}: ${waitForDuration.error}`); return; } const { willCheckpointAndRestore } = waitForDuration.result; if (!willCheckpointAndRestore) { await internalTimeout; break noResume; } await this.#prepareForWait("WAIT_FOR_DURATION", willCheckpointAndRestore); // CHECKPOINTING AFTER THIS LINE // internalTimeout acts as a backup and will be accurate if the checkpoint never happens // checkpointSafeInternalTimeout is accurate even after non-simulated restores await Promise.race([internalTimeout, checkpointSafeInternalTimeout]); const idempotencyKey = randomUUID(); this.durationResumeFallback = { idempotencyKey }; try { await this.restoreNotification.waitFor(5_000); } catch (error) { logger.error("Did not receive restore notification in time", { error, }); } try { // The coordinator should cancel any in-progress checkpoints so we don't end up with race conditions const { checkpointCanceled } = await this.#coordinatorSocket.socket .timeout(15_000) .emitWithAck("CANCEL_CHECKPOINT", { version: "v2", reason: "WAIT_FOR_DURATION", }); logger.log("onCancelCheckpoint coordinator response", { checkpointCanceled }); if (checkpointCanceled) { // If the checkpoint was canceled, we will never be resumed externally with RESUME_AFTER_DURATION, so it's safe to immediately resume break noResume; } logger.log("Waiting for external duration resume as we may have been restored"); setTimeout(() => { 
                    if (!this.durationResumeFallback) {
                        logger.error("Already resumed after duration, skipping fallback");
                        return;
                    }
                    if (this.durationResumeFallback.idempotencyKey !== idempotencyKey) {
                        logger.error("Duration resume idempotency key mismatch, skipping fallback");
                        return;
                    }
                    logger.log("Resuming after duration with fallback");
                    this.#resumeAfterDuration();
                }, 15_000);
            }
            catch (error) {
                // Just log this for now, but don't automatically resume. Wait for the external checkpoint-based resume.
                logger.debug("Checkpoint cancellation timed out", {
                    message,
                    error,
                });
            }
            return;
        }
        this.#resumeAfterDuration();
    }
    // MARK: REPLAYS
    async #handleReplays() {
        const backoff = new ExponentialBackoff().type("FullJitter").maxRetries(3);
        const replayCancellationDelay = 20_000;
        if (this.waitForTaskReplay) {
            logger.log("replaying wait for task", { ...this.waitForTaskReplay });
            const { idempotencyKey, message, attempt } = this.waitForTaskReplay;
            // Give the platform some time to send RESUME_AFTER_DEPENDENCY
            await timeout(replayCancellationDelay);
            if (!this.waitForTaskReplay) {
                logger.error("wait for task replay cancelled, discarding", {
                    originalMessage: { idempotencyKey, message, attempt },
                });
                return;
            }
            if (idempotencyKey !== this.waitForTaskReplay.idempotencyKey) {
                logger.error("wait for task replay idempotency key mismatch, discarding", {
                    originalMessage: { idempotencyKey, message, attempt },
                    newMessage: this.waitForTaskReplay,
                });
                return;
            }
            try {
                await backoff.wait(attempt + 1);
                await this.#handleOnWaitForTask(message, idempotencyKey);
            }
            catch (error) {
                if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
                    logger.error("wait for task replay retry limit exceeded", { error });
                }
                else {
                    logger.error("wait for task replay error", { error });
                }
            }
            return;
        }
        if (this.waitForBatchReplay) {
            logger.log("replaying wait for batch", {
                ...this.waitForBatchReplay,
                cancellationDelay: replayCancellationDelay,
            });
            const { idempotencyKey, message, attempt } = this.waitForBatchReplay;
            // Give the platform some time to send RESUME_AFTER_DEPENDENCY
            await timeout(replayCancellationDelay);
            if (!this.waitForBatchReplay) {
                logger.error("wait for batch replay cancelled, discarding", {
                    originalMessage: { idempotencyKey, message, attempt },
                });
                return;
            }
            if (idempotencyKey !== this.waitForBatchReplay.idempotencyKey) {
                logger.error("wait for batch replay idempotency key mismatch, discarding", {
                    originalMessage: { idempotencyKey, message, attempt },
                    newMessage: this.waitForBatchReplay,
                });
                return;
            }
            try {
                await backoff.wait(attempt + 1);
                await this.#handleOnWaitForBatch(message, idempotencyKey);
            }
            catch (error) {
                if (error instanceof ExponentialBackoff.RetryLimitExceeded) {
                    logger.error("wait for batch replay retry limit exceeded", { error });
                }
                else {
                    logger.error("wait for batch replay error", { error });
                }
            }
            return;
        }
    }
    async #killCurrentTaskRunProcessBeforeAttempt() {
        console.log("killCurrentTaskRunProcessBeforeAttempt()", {
            hasTaskRunProcess: !!this._taskRunProcess,
        });
        if (!this._taskRunProcess) {
            return;
        }
        const currentTaskRunProcess = this._taskRunProcess;
        await currentTaskRunProcess.kill();
    }
    // MARK: HTTP SERVER
    #createHttpServer() {
        const httpServer = createServer(async (req, res) => {
            logger.log(`[${req.method}]`, req.url);
            const reply = new HttpReply(res);
            try {
                const url = new URL(req.url ?? "", `http://${req.headers.host}`);
                switch (url.pathname) {
                    case "/health": {
                        return reply.text("ok");
                    }
                    case "/status": {
                        return reply.json(this.#status);
                    }
                    case "/connect": {
                        this.#coordinatorSocket.connect();
                        return reply.text("Connected to coordinator");
                    }
                    case "/close": {
                        this.#coordinatorSocket.close();
                        this.connectionCount = 0;
                        return reply.text("Disconnected from coordinator");
                    }
                    case "/test": {
                        await this.#coordinatorSocket.socket.timeout(10_000).emitWithAck("TEST", {
                            version: "v1",
                        });
                        return reply.text("Received ACK from coordinator");
                    }
                    case "/preStop": {
                        const cause = PreStopCauses.safeParse(url.searchParams.get("cause"));
                        if (!cause.success) {
                            logger.error("Failed to parse cause", { cause });
                            return reply.text("Failed to parse cause", 400);
                        }
                        switch (cause.data) {
                            case "terminate": {
                                break;
                            }
                            default: {
                                logger.error("Unhandled cause", { cause: cause });
                                break;
                            }
                        }
                        return reply.text("preStop ok");
                    }
                    case "/postStart": {
                        const cause = PostStartCauses.safeParse(url.searchParams.get("cause"));
                        if (!cause.success) {
                            logger.error("Failed to parse cause", { cause });
                            return reply.text("Failed to parse cause", 400);
                        }
                        switch (cause.data) {
                            case "index": {
                                break;
                            }
                            case "create": {
                                break;
                            }
                            case "restore": {
                                await this.#reconnectAfterPostStart();
                                this.restoreNotification.post();
                                break;
                            }
                            default: {
                                logger.error("Unhandled cause", { cause: cause });
                                break;
                            }
                        }
                        return reply.text("postStart ok");
                    }
                    default: {
                        return reply.empty(404);
                    }
                }
            }
            catch (error) {
                logger.error("HTTP server error", { error });
                reply.empty(500);
            }
            return;
        });
        httpServer.on("clientError", (err, socket) => {
            socket.end("HTTP/1.1 400 Bad Request\r\n\r\n");
        });
        httpServer.on("listening", () => {
            logger.log("http server listening on port", this.#httpPort);
        });
        httpServer.on("error", async (error) => {
            // @ts-expect-error
            if (error.code != "EADDRINUSE") {
                return;
            }
            logger.error(`port ${this.#httpPort} already in use, retrying with random port..`);
            this.#httpPort = getRandomPortNumber();
            await timeout(100);
            this.start();
        });
        return httpServer;
    }
    get #status() {
        return {
            executing: this.executing,
            paused: this.paused,
            completed: this.completed.size,
            nextResumeAfter: this.nextResumeAfter,
            waitForPostStart: this.waitForPostStart,
            attemptFriendlyId: this.attemptFriendlyId,
            attemptNumber: this.attemptNumber,
            waitForTaskReplay: this.waitForTaskReplay,
            waitForBatchReplay: this.waitForBatchReplay,
            readyForLazyAttemptReplay: this.readyForLazyAttemptReplay,
            durationResumeFallback: this.durationResumeFallback,
            readyForResumeReplay: this.readyForResumeReplay,
        };
    }
    #emitUnrecoverableError(name, message) {
        this.#coordinatorSocket.socket.emit("UNRECOVERABLE_ERROR", {
            version: "v1",
            error: {
                name,
                message,
            },
        });
    }
    async start() {
        this.#httpServer.listen(this.#httpPort, this.host);
    }
}
const workerManifest = await loadWorkerManifest();
const prodWorker = new ProdWorker(HTTP_SERVER_PORT, workerManifest);
await prodWorker.start();
function gatherProcessEnv() {
    const $env = {
        NODE_ENV: env.NODE_ENV ?? "production",
        NODE_EXTRA_CA_CERTS: env.NODE_EXTRA_CA_CERTS,
        OTEL_EXPORTER_OTLP_ENDPOINT: env.OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://0.0.0.0:4318",
    };
    // Filter out undefined values
    return Object.fromEntries(Object.entries($env).filter(([key, value]) => value !== undefined));
}
async function loadWorkerManifest() {
    const manifestContents = await readFile("./index.json", "utf-8");
    const raw = JSON.parse(manifestContents);
    return WorkerManifest.parse(raw);
}