@convex-dev/workpool

A Convex component for managing async work.

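This file is the compiled scheduling loop (loop.js) at the heart of the component: it drains the pendingStart, pendingCompletion, and pendingCancelation tables, enforces maxParallelism, and schedules retries and recovery. For orientation, here is a minimal sketch of how an application might enqueue work that this loop processes. The client-side names (Workpool, enqueueAction, the maxParallelism option) follow the package's README and are assumptions here, since only the server loop appears in this file:

// convex/example.ts -- hypothetical application code, not part of this file.
import { v } from "convex/values";
import { mutation, internalAction } from "./_generated/server";
import { components, internal } from "./_generated/api";
import { Workpool } from "@convex-dev/workpool";

// Cap concurrent jobs; the loop below reads this as globals.maxParallelism.
// `emailPool` is whatever name the app registers in its convex.config.ts.
const pool = new Workpool(components.emailPool, { maxParallelism: 10 });

export const sendWelcomeEmail = internalAction({
  args: { to: v.string() },
  handler: async (_ctx, { to }) => {
    // ...call an email provider here...
  },
});

export const signUp = mutation({
  args: { email: v.string() },
  handler: async (ctx, { email }) => {
    // Enqueue async work instead of scheduling it directly. On failure, the
    // loop's rescheduleJob applies the work's retryBehavior (initialBackoffMs,
    // base) with jitter before trying again.
    await pool.enqueueAction(ctx, internal.example.sendWelcomeEmail, { to: email });
  },
});

The compiled source of the loop follows.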
import { v } from "convex/values";
import { internal } from "./_generated/api.js";
import { internalMutation } from "./_generated/server.js";
import { createLogger, DEFAULT_LOG_LEVEL } from "./logging.js";
import {
  boundScheduledTime,
  DEFAULT_MAX_PARALLELISM,
  fromSegment,
  getCurrentSegment,
  getNextSegment,
  max,
  toSegment,
} from "./shared.js";
import { recordCompleted, generateReport, recordStarted } from "./stats.js";

const CANCELLATION_BATCH_SIZE = 64; // the only queue that can get unbounded.
const SECOND = 1000;
const MINUTE = 60 * SECOND;
const RECOVERY_THRESHOLD_MS = 5 * MINUTE; // attempt to recover jobs this old.
export const RECOVERY_PERIOD_SEGMENTS = toSegment(1 * MINUTE); // how often to check.
const CURSOR_BUFFER_SEGMENTS = toSegment(30 * SECOND); // buffer for cursor updates.

export const INITIAL_STATE = {
  generation: 0n,
  segmentCursors: { incoming: 0n, completion: 0n, cancelation: 0n },
  lastRecovery: 0n,
  report: {
    completed: 0,
    succeeded: 0,
    failed: 0,
    retries: 0,
    canceled: 0,
    lastReportTs: 0,
  },
  running: [],
};

// There should only ever be at most one of these scheduled or running.
export const main = internalMutation({
  args: { generation: v.int64(), segment: v.int64() },
  handler: async (ctx, { generation, segment }) => {
    // State will be modified and patched at the end of the function.
    const state = await getOrCreateState(ctx);
    if (generation !== state.generation) {
      throw new Error(`generation mismatch: ${generation} !== ${state.generation}`);
    }
    state.generation++;
    const runStatus = await getOrCreateRunningStatus(ctx);
    if (runStatus.state.kind !== "running") {
      await ctx.db.patch(runStatus._id, {
        state: { kind: "running" },
      });
    }
    const globals = await getGlobals(ctx);
    const console = createLogger(globals.logLevel);
    const delayMs = Date.now() - fromSegment(segment);
    console.debug(`[main] generation ${generation} behind: ${delayMs}ms`);
    // Read pendingCompletions, including retry handling.
    console.time("[main] pendingCompletion");
    const toCancel = await handleCompletions(ctx, state, segment, console);
    console.timeEnd("[main] pendingCompletion");
    // Read pendingCancelation, deleting from pendingStart. If it's still running, queue to cancel.
    console.time("[main] pendingCancelation");
    await handleCancelation(ctx, state, segment, console, toCancel);
    console.timeEnd("[main] pendingCancelation");
    if (state.running.length === 0) {
      // If there's nothing active, reset lastRecovery.
      state.lastRecovery = segment;
    } else if (segment - state.lastRecovery >= RECOVERY_PERIOD_SEGMENTS) {
      // Otherwise schedule recovery for any old jobs.
      await handleRecovery(ctx, state, console);
      state.lastRecovery = segment;
    }
    // Read pendingStart up to max capacity. Update the config, and incomingSegmentCursor.
    console.time("[main] pendingStart");
    await handleStart(ctx, state, segment, console, globals);
    console.timeEnd("[main] pendingStart");
    if (Date.now() - state.report.lastReportTs >= MINUTE) {
      // If minute rollover since last report, log report.
      // Try to avoid clock skew by shifting by a minute.
      let lastReportTs = state.report.lastReportTs + MINUTE;
      if (Date.now() > lastReportTs + MINUTE / 2) {
        // It's been a while, let's start fresh.
        lastReportTs = Date.now();
      }
      await generateReport(ctx, console, state, globals);
      state.report = {
        completed: 0,
        succeeded: 0,
        failed: 0,
        retries: 0,
        canceled: 0,
        lastReportTs,
      };
    }
    await ctx.db.replace(state._id, state);
    await ctx.scheduler.runAfter(0, internal.loop.updateRunStatus, {
      generation: state.generation,
      segment,
    });
    // TODO: if there were more cancellations, schedule main directly.
  },
});

export const updateRunStatus = internalMutation({
  args: { generation: v.int64(), segment: v.int64() },
  handler: async (ctx, { generation, segment }) => {
    const globals = await getGlobals(ctx);
    const console = createLogger(globals.logLevel);
    const maxParallelism = globals.maxParallelism;
    const state = await getOrCreateState(ctx);
    if (generation !== state.generation) {
      throw new Error(`generation mismatch: ${generation} !== ${state.generation}`);
    }
    console.time("[updateRunStatus] outstandingCancelations");
    const outstandingCancelations = await getNextUp(ctx, "pendingCancelation", {
      start: state.segmentCursors.cancelation,
      end: segment,
    });
    console.timeEnd("[updateRunStatus] outstandingCancelations");
    if (outstandingCancelations) {
      await ctx.scheduler.runAfter(0, internal.loop.main, {
        generation,
        segment,
      });
      return;
    }
    // TODO: check for current segment (or from args) first, to avoid OCCs.
    console.time("[updateRunStatus] nextSegmentIsActionable");
    const nextSegment = max(segment + 1n, getCurrentSegment());
    const nextIsActionable = await nextSegmentIsActionable(ctx, state, maxParallelism, nextSegment);
    console.timeEnd("[updateRunStatus] nextSegmentIsActionable");
    if (nextIsActionable) {
      await ctx.scheduler.runAt(
        boundScheduledTime(fromSegment(nextSegment), console),
        internal.loop.main,
        { generation, segment: nextSegment },
      );
      return;
    }
    console.time("[updateRunStatus] oldSegmentIsActionable");
    const [oldIsActionable, cursors] = await oldSegmentIsActionable(ctx, state, maxParallelism);
    console.timeEnd("[updateRunStatus] oldSegmentIsActionable");
    if (oldIsActionable) {
      await ctx.db.patch(state._id, {
        segmentCursors: {
          ...state.segmentCursors,
          ...cursors,
        },
      });
      await ctx.scheduler.runAfter(0, internal.loop.main, {
        generation,
        segment: getCurrentSegment(),
      });
      return;
    }
    // Find next actionable segment (min next segment).
    console.time("[updateRunStatus] findNextSegment");
    const actionableTables = ["pendingCompletion", "pendingCancelation"];
    if (state.running.length < maxParallelism) {
      actionableTables.push("pendingStart");
    }
    const docs = await Promise.all(
      actionableTables.map(async (tableName) => getNextUp(ctx, tableName, { start: nextSegment })),
    );
    console.timeEnd("[updateRunStatus] findNextSegment");
    let targetSegment = docs.map((d) => d?.segment).sort()[0];
    const runStatus = await getOrCreateRunningStatus(ctx);
    const saturated = state.running.length >= maxParallelism;
    if (targetSegment !== undefined || state.running.length > 0) {
      // If there's something to do, schedule for next actionable segment.
      // Or the next recovery, whichever comes first.
      const nextRecoverySegment = state.lastRecovery + RECOVERY_PERIOD_SEGMENTS;
      if (!targetSegment || targetSegment > nextRecoverySegment) {
        targetSegment = nextRecoverySegment;
      }
      const scheduledId = await ctx.scheduler.runAt(
        boundScheduledTime(fromSegment(targetSegment), console),
        internal.loop.main,
        { generation, segment: targetSegment },
      );
      if (targetSegment > getNextSegment()) {
        await ctx.db.patch(runStatus._id, {
          state: {
            kind: "scheduled",
            scheduledId,
            saturated,
            generation,
            segment: targetSegment,
          },
        });
      } else {
        console.debug(`[updateRunStatus] staying running because it's the next segment`);
      }
      return;
    }
    // There seems to be nothing in the future to do, so go idle.
    await ctx.db.patch(runStatus._id, {
      state: { kind: "idle", generation },
    });
  },
});

async function nextSegmentIsActionable(ctx, state, maxParallelism, end) {
  // First, try with our cursor range, up to end.
  if (await getNextUp(ctx, "pendingCancelation", {
    start: state.segmentCursors.cancelation,
    end,
  })) {
    return true;
  }
  if (await getNextUp(ctx, "pendingCompletion", {
    start: state.segmentCursors.completion,
    end,
  })) {
    return true;
  }
  if (state.running.length < maxParallelism) {
    if (await getNextUp(ctx, "pendingStart", {
      start: state.segmentCursors.incoming,
      end,
    })) {
      return true;
    }
  }
  return false;
}

async function oldSegmentIsActionable(ctx, state, maxParallelism) {
  // Next, we look for out-of-order additions we may have missed.
  const oldCompletion = await getNextUp(ctx, "pendingCompletion", {
    end: state.segmentCursors.completion,
  });
  if (oldCompletion) {
    return [true, { completion: oldCompletion.segment }];
  }
  const oldCancelation = await getNextUp(ctx, "pendingCancelation", {
    end: state.segmentCursors.cancelation,
  });
  if (oldCancelation) {
    return [true, { cancelation: oldCancelation.segment }];
  }
  if (state.running.length < maxParallelism) {
    const oldStart = await getNextUp(ctx, "pendingStart", {
      end: state.segmentCursors.incoming,
    });
    if (oldStart) {
      return [true, { incoming: oldStart.segment }];
    }
  }
  return [false, {}];
}

// Fetch the next item. If only one of start & end are provided, it's exclusive.
async function getNextUp(ctx, table, range) {
  return ctx.db
    .query(table)
    .withIndex("segment", (q) =>
      range.start !== undefined
        ? range.end !== undefined
          ? q
              .gte("segment", range.start - CURSOR_BUFFER_SEGMENTS)
              .lte("segment", range.end)
          : q.gt("segment", range.start - CURSOR_BUFFER_SEGMENTS)
        : range.end !== undefined
          ? q.lt("segment", range.end)
          : q)
    .first();
}

/**
 * Handles the completion of pending completions.
 * This only processes work that succeeded or failed, not canceled.
 */
async function handleCompletions(ctx, state, segment, console) {
  const startSegment = state.segmentCursors.completion - CURSOR_BUFFER_SEGMENTS;
  // This won't be too many because the jobs all correspond to being scheduled
  // by a single main (the previous one), so they're limited by MAX_PARALLELISM.
  const completed = await ctx.db
    .query("pendingCompletion")
    .withIndex("segment", (q) => q.gte("segment", startSegment).lte("segment", segment))
    .collect();
  state.segmentCursors.completion = segment;
  // Completions that were going to be retried but have since been canceled.
  const toCancel = [];
  await Promise.all(completed.map(async (c) => {
    await ctx.db.delete(c._id);
    const running = state.running.find((r) => r.workId === c.workId);
    if (!running) {
      console.error(`[main] completing ${c.workId} but it's not in "running"`);
      return;
    }
    if (c.retry) {
      // Only check for work if it's going to be retried.
      const work = await ctx.db.get(c.workId);
      if (!work) {
        console.warn(`[main] ${c.workId} is gone, but trying to complete`);
        return;
      }
      const retried = await rescheduleJob(ctx, work, console);
      if (retried) {
        state.report.retries++;
        recordCompleted(console, work, "retrying");
      } else {
        // We don't retry if it's been canceled in the mean time.
        state.report.canceled++;
        toCancel.push({
          workId: c.workId,
          runResult: { kind: "canceled" },
          attempt: work.attempts,
        });
      }
    } else {
      if (c.runResult.kind === "success") {
        state.report.succeeded++;
      } else if (c.runResult.kind === "failed") {
        state.report.failed++;
      }
    }
  }));
  // We do this after so the stats above know if it was in progress.
  const before = state.running.length;
  state.running = state.running.filter((r) => !completed.some((c) => c.workId === r.workId));
  const numCompleted = before - state.running.length;
  state.report.completed += numCompleted;
  console.debug(`[main] completed ${numCompleted} work`);
  return toCancel;
}

async function handleCancelation(ctx, state, segment, console, toCancel) {
  const start = state.segmentCursors.cancelation - CURSOR_BUFFER_SEGMENTS;
  const canceled = await ctx.db
    .query("pendingCancelation")
    .withIndex("segment", (q) => q.gte("segment", start).lte("segment", segment))
    .take(CANCELLATION_BATCH_SIZE);
  state.segmentCursors.cancelation = canceled.at(-1)?.segment ?? segment;
  if (canceled.length) {
    console.debug(`[main] attempting to cancel ${canceled.length}`);
  }
  const canceledWork = new Set();
  const runResult = { kind: "canceled" };
  const jobs = toCancel.concat(...(await Promise.all(canceled.map(async ({ _id, _creationTime, workId }) => {
    await ctx.db.delete(_id);
    if (canceledWork.has(workId)) {
      // We shouldn't have multiple pending cancelations for the same work.
      console.error(`[main] ${workId} already canceled`);
      return null;
    }
    const work = await ctx.db.get(workId);
    if (!work) {
      console.warn(`[main] ${workId} is gone, but trying to cancel`);
      return null;
    }
    // Ensure it doesn't retry.
    await ctx.db.patch(workId, { canceled: true });
    // Ensure it doesn't start.
    const pendingStart = await ctx.db
      .query("pendingStart")
      .withIndex("workId", (q) => q.eq("workId", workId))
      .unique();
    if (pendingStart && !canceledWork.has(workId)) {
      state.report.canceled++;
      await ctx.db.delete(pendingStart._id);
      canceledWork.add(workId);
      return { workId, runResult, attempt: work.attempts };
    }
    return null;
  }))).flatMap((r) => (r ? [r] : [])));
  if (jobs.length) {
    await ctx.scheduler.runAfter(0, internal.complete.complete, { jobs });
  }
}

async function handleRecovery(ctx, state, console) {
  const missing = new Set();
  const oldEnoughToConsider = Date.now() - RECOVERY_THRESHOLD_MS;
  const jobs = (await Promise.all(state.running.map(async (r) => {
    if (r.started >= oldEnoughToConsider) {
      return null;
    }
    const work = await ctx.db.get(r.workId);
    if (!work) {
      missing.add(r.workId);
      console.error(`[main] ${r.workId} already gone (skipping recovery)`);
      return null;
    }
    return { ...r, attempt: work.attempts };
  }))).flatMap((r) => (r ? [r] : []));
  state.running = state.running.filter((r) => !missing.has(r.workId));
  if (jobs.length) {
    await ctx.scheduler.runAfter(0, internal.recovery.recover, { jobs });
  }
}

async function handleStart(ctx, state, segment, console, { maxParallelism, logLevel }) {
  // Schedule as many as needed to reach maxParallelism.
  const toSchedule = maxParallelism - state.running.length;
  const pending = toSchedule > 0
    ? await ctx.db
        .query("pendingStart")
        .withIndex("segment", (q) => q
          .gte("segment", state.segmentCursors.incoming - CURSOR_BUFFER_SEGMENTS)
          .lte("segment", segment))
        .take(toSchedule)
    : [];
  if (pending) {
    if (pending.length > 0) {
      state.segmentCursors.incoming = pending.at(-1).segment;
    } else if (toSchedule > 0) {
      // We have no more pending work, update to now
      state.segmentCursors.incoming = segment;
    }
  }
  console.debug(`[main] scheduling ${pending.length} pending work`);
  // Start new work.
  state.running.push(...(await Promise.all(pending.map(async ({ _id, workId, segment }) => {
    if (state.running.some((r) => r.workId === workId)) {
      console.error(`[main] ${workId} already running (skipping start)`);
      return null;
    }
    const lagMs = Date.now() - fromSegment(segment);
    const scheduledId = await beginWork(ctx, workId, logLevel, lagMs);
    await ctx.db.delete(_id);
    return { scheduledId, workId, started: Date.now() };
  }))).flatMap((r) => (r ? [r] : [])));
}

async function beginWork(ctx, workId, logLevel, lagMs) {
  const console = createLogger(logLevel);
  const work = await ctx.db.get(workId);
  if (!work) {
    throw new Error("work not found");
  }
  recordStarted(console, work, lagMs);
  const { attempts: attempt, fnHandle, fnArgs } = work;
  const args = { workId, fnHandle, fnArgs, logLevel, attempt };
  if (work.fnType === "action") {
    return ctx.scheduler.runAfter(0, internal.worker.runActionWrapper, args);
  } else if (work.fnType === "mutation" || work.fnType === "query") {
    return ctx.scheduler.runAfter(0, internal.worker.runMutationWrapper, {
      ...args,
      fnType: work.fnType,
    });
  } else {
    throw new Error(`Unexpected fnType ${work.fnType}`);
  }
}

/**
 * Reschedules a job for retry.
 * If it's been canceled in the mean time, don't retry.
 * @returns true if the job was rescheduled, false if it was not.
 */
async function rescheduleJob(ctx, work, console) {
  const pendingCancelation = await ctx.db
    .query("pendingCancelation")
    .withIndex("workId", (q) => q.eq("workId", work._id))
    .unique();
  if (pendingCancelation) {
    // If there's an un-processed cancelation request, don't retry.
    console.warn(`[main] ${work._id} in pendingCancelation so not retrying`);
    return false;
  }
  if (work.canceled) {
    return false;
  }
  if (!work.retryBehavior) {
    console.warn(`[main] ${work._id} has no retryBehavior so not retrying`);
    return false;
  }
  const existing = await ctx.db
    .query("pendingStart")
    .withIndex("workId", (q) => q.eq("workId", work._id))
    .first();
  if (existing) {
    // Not sure why this would ever happen, but ensure uniqueness explicitly.
    console.error(`[main] ${work._id} already in pendingStart so not retrying`);
    return false;
  }
  const backoffMs = work.retryBehavior.initialBackoffMs * Math.pow(work.retryBehavior.base, work.attempts - 1);
  const nextAttempt = withJitter(backoffMs);
  const startTime = boundScheduledTime(Date.now() + nextAttempt, console);
  const segment = toSegment(startTime);
  await ctx.db.insert("pendingStart", {
    workId: work._id,
    segment,
  });
  return true;
}

export function withJitter(delay) {
  return delay * (0.5 + Math.random());
}

async function getGlobals(ctx) {
  const globals = await ctx.db.query("globals").unique();
  if (!globals) {
    return {
      maxParallelism: DEFAULT_MAX_PARALLELISM,
      logLevel: DEFAULT_LOG_LEVEL,
    };
  }
  return globals;
}

async function getOrCreateState(ctx) {
  const state = await ctx.db.query("internalState").unique();
  if (state) return state;
  const globals = await getGlobals(ctx);
  const console = createLogger(globals.logLevel);
  console.error("No internalState in running loop! Re-creating empty one...");
  return (await ctx.db.get(await ctx.db.insert("internalState", INITIAL_STATE)));
}

async function getOrCreateRunningStatus(ctx) {
  const runStatus = await ctx.db.query("runStatus").unique();
  if (runStatus) return runStatus;
  const globals = await getGlobals(ctx);
  const console = createLogger(globals.logLevel);
  console.error("No runStatus in running loop! Re-creating one...");
  return (await ctx.db.get(await ctx.db.insert("runStatus", { state: { kind: "running" } })));
}

// eslint-disable-next-line @typescript-eslint/no-unused-vars
const console = "THIS IS A REMINDER TO USE createLogger";
//# sourceMappingURL=loop.js.map
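For reference, the retry delay that rescheduleJob computes above is a geometric backoff scaled by withJitter. A small standalone sketch of the same arithmetic, using hypothetical retryBehavior values of initialBackoffMs = 250 and base = 2:

// Mirrors rescheduleJob's backoff math; the numbers are example values only.
function retryDelayMs(initialBackoffMs: number, base: number, attempts: number): number {
  const backoffMs = initialBackoffMs * Math.pow(base, attempts - 1); // grows geometrically per attempt
  return backoffMs * (0.5 + Math.random()); // withJitter: 50%-150% of the backoff
}

// Third attempt with { initialBackoffMs: 250, base: 2 }:
// backoffMs = 250 * 2^2 = 1000ms, jittered to somewhere in [500, 1500) ms.
console.log(retryDelayMs(250, 2, 3));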