@convex-dev/workpool
A Convex component for managing async work.
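For orientation, here is a minimal usage sketch from an application's own Convex functions. The Workpool class and enqueueAction follow the package's documented surface; the component registration name (components.workpool) and internal.emails.send are hypothetical placeholders:

import { mutation } from "./_generated/server";
import { components, internal } from "./_generated/api";
import { Workpool } from "@convex-dev/workpool";

// One pool per logical queue; maxParallelism caps how many jobs run at once.
const pool = new Workpool(components.workpool, { maxParallelism: 10 });

export const kickOff = mutation({
  handler: async (ctx) => {
    // internal.emails.send is a placeholder for any action in your app. The loop
    // in loop.js (below) schedules it, tracks completion, and retries it when
    // retry behavior is configured.
    await pool.enqueueAction(ctx, internal.emails.send, { to: "user@example.com" });
  },
});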
503 lines • 21.1 kB
JavaScript
import { v } from "convex/values";
import { internal } from "./_generated/api.js";
import { internalMutation } from "./_generated/server.js";
import { createLogger, DEFAULT_LOG_LEVEL, } from "./logging.js";
import { boundScheduledTime, DEFAULT_MAX_PARALLELISM, fromSegment, getCurrentSegment, getNextSegment, max, toSegment, } from "./shared.js";
import { recordCompleted, generateReport, recordStarted } from "./stats.js";
const CANCELLATION_BATCH_SIZE = 64; // pendingCancelation is the only queue that can grow without bound, so drain it in batches.
const SECOND = 1000;
const MINUTE = 60 * SECOND;
const RECOVERY_THRESHOLD_MS = 5 * MINUTE; // attempt to recover jobs this old.
export const RECOVERY_PERIOD_SEGMENTS = toSegment(1 * MINUTE); // how often to check.
const CURSOR_BUFFER_SEGMENTS = toSegment(30 * SECOND); // re-scan this far behind each cursor to catch out-of-order writes.
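// Singleton loop state. `generation` increments on every run of main and guards
// against overlapping loops; `segmentCursors` record how far each queue has been
// read (pendingStart via `incoming`, pendingCompletion, pendingCancelation);
// `lastRecovery` is the last segment recovery ran; `report` accumulates
// per-minute stats; `running` holds the currently scheduled work items.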
export const INITIAL_STATE = {
generation: 0n,
segmentCursors: { incoming: 0n, completion: 0n, cancelation: 0n },
lastRecovery: 0n,
report: {
completed: 0,
succeeded: 0,
failed: 0,
retries: 0,
canceled: 0,
lastReportTs: 0,
},
running: [],
};
// There should only ever be at most one of these scheduled or running.
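// Each run drains pendingCompletion and pendingCancelation, schedules recovery
// for stale jobs, starts pendingStart work up to maxParallelism, emits a
// per-minute report, then hands off to updateRunStatus to plan the next run.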
export const main = internalMutation({
args: { generation: v.int64(), segment: v.int64() },
handler: async (ctx, { generation, segment }) => {
// State will be modified and patched at the end of the function.
const state = await getOrCreateState(ctx);
if (generation !== state.generation) {
throw new Error(`generation mismatch: ${generation} !== ${state.generation}`);
}
state.generation++;
const runStatus = await getOrCreateRunningStatus(ctx);
if (runStatus.state.kind !== "running") {
await ctx.db.patch(runStatus._id, {
state: { kind: "running" },
});
}
const globals = await getGlobals(ctx);
const console = createLogger(globals.logLevel);
const delayMs = Date.now() - fromSegment(segment);
console.debug(`[main] generation ${generation} behind: ${delayMs}ms`);
// Read pendingCompletions, including retry handling.
console.time("[main] pendingCompletion");
const toCancel = await handleCompletions(ctx, state, segment, console);
console.timeEnd("[main] pendingCompletion");
// Read pendingCancelation, deleting from pendingStart. If it's still running, queue to cancel.
console.time("[main] pendingCancelation");
await handleCancelation(ctx, state, segment, console, toCancel);
console.timeEnd("[main] pendingCancelation");
if (state.running.length === 0) {
// If there's nothing active, reset lastRecovery.
state.lastRecovery = segment;
}
else if (segment - state.lastRecovery >= RECOVERY_PERIOD_SEGMENTS) {
// Otherwise schedule recovery for any old jobs.
await handleRecovery(ctx, state, console);
state.lastRecovery = segment;
}
// Read pendingStart up to max capacity and advance the incoming segment cursor.
console.time("[main] pendingStart");
await handleStart(ctx, state, segment, console, globals);
console.timeEnd("[main] pendingStart");
if (Date.now() - state.report.lastReportTs >= MINUTE) {
// A minute has passed since the last report, so emit one.
// Advance lastReportTs by a whole minute to keep report windows aligned.
let lastReportTs = state.report.lastReportTs + MINUTE;
if (Date.now() > lastReportTs + MINUTE / 2) {
// It's been a while, let's start fresh.
lastReportTs = Date.now();
}
await generateReport(ctx, console, state, globals);
state.report = {
completed: 0,
succeeded: 0,
failed: 0,
retries: 0,
canceled: 0,
lastReportTs,
};
}
await ctx.db.replace(state._id, state);
await ctx.scheduler.runAfter(0, internal.loop.updateRunStatus, {
generation: state.generation,
segment,
});
// TODO: if there were more cancellations, schedule main directly.
},
});
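// Decides how the loop should continue: re-run main immediately if cancelations
// are outstanding or an old segment became actionable, schedule main for the
// next actionable (or recovery) segment, or mark the loop idle when nothing is
// pending and nothing is running.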
export const updateRunStatus = internalMutation({
args: { generation: v.int64(), segment: v.int64() },
handler: async (ctx, { generation, segment }) => {
const globals = await getGlobals(ctx);
const console = createLogger(globals.logLevel);
const maxParallelism = globals.maxParallelism;
const state = await getOrCreateState(ctx);
if (generation !== state.generation) {
throw new Error(`generation mismatch: ${generation} !== ${state.generation}`);
}
console.time("[updateRunStatus] outstandingCancelations");
const outstandingCancelations = await getNextUp(ctx, "pendingCancelation", {
start: state.segmentCursors.cancelation,
end: segment,
});
console.timeEnd("[updateRunStatus] outstandingCancelations");
if (outstandingCancelations) {
await ctx.scheduler.runAfter(0, internal.loop.main, {
generation,
segment,
});
return;
}
// TODO: check for current segment (or from args) first, to avoid OCCs.
console.time("[updateRunStatus] nextSegmentIsActionable");
const nextSegment = max(segment + 1n, getCurrentSegment());
const nextIsActionable = await nextSegmentIsActionable(ctx, state, maxParallelism, nextSegment);
console.timeEnd("[updateRunStatus] nextSegmentIsActionable");
if (nextIsActionable) {
await ctx.scheduler.runAt(boundScheduledTime(fromSegment(nextSegment), console), internal.loop.main, {
generation,
segment: nextSegment,
});
return;
}
console.time("[updateRunStatus] oldSegmentIsActionable");
const [oldIsActionable, cursors] = await oldSegmentIsActionable(ctx, state, maxParallelism);
console.timeEnd("[updateRunStatus] oldSegmentIsActionable");
if (oldIsActionable) {
await ctx.db.patch(state._id, {
segmentCursors: {
...state.segmentCursors,
...cursors,
},
});
await ctx.scheduler.runAfter(0, internal.loop.main, {
generation,
segment: getCurrentSegment(),
});
return;
}
// Find next actionable segment (min next segment).
console.time("[updateRunStatus] findNextSegment");
const actionableTables = ["pendingCompletion", "pendingCancelation"];
if (state.running.length < maxParallelism) {
actionableTables.push("pendingStart");
}
const docs = await Promise.all(actionableTables.map(async (tableName) => getNextUp(ctx, tableName, { start: nextSegment })));
console.timeEnd("[updateRunStatus] findNextSegment");
// Pick the earliest candidate segment; the default sort would compare BigInt segments as strings.
let targetSegment = docs
.flatMap((d) => (d ? [d.segment] : []))
.sort((a, b) => (a < b ? -1 : a > b ? 1 : 0))[0];
const runStatus = await getOrCreateRunningStatus(ctx);
const saturated = state.running.length >= maxParallelism;
if (targetSegment !== undefined || state.running.length > 0) {
// If there's something to do, schedule for next actionable segment.
// Or the next recovery, whichever comes first.
const nextRecoverySegment = state.lastRecovery + RECOVERY_PERIOD_SEGMENTS;
if (!targetSegment || targetSegment > nextRecoverySegment) {
targetSegment = nextRecoverySegment;
}
const scheduledId = await ctx.scheduler.runAt(boundScheduledTime(fromSegment(targetSegment), console), internal.loop.main, { generation, segment: targetSegment });
if (targetSegment > getNextSegment()) {
await ctx.db.patch(runStatus._id, {
state: {
kind: "scheduled",
scheduledId,
saturated,
generation,
segment: targetSegment,
},
});
}
else {
console.debug(`[updateRunStatus] staying running because it's the next segment`);
}
return;
}
// There seems to be nothing in the future to do, so go idle.
await ctx.db.patch(runStatus._id, {
state: { kind: "idle", generation },
});
},
});
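// Returns true if any queue has work between its cursor and `end`; pendingStart
// is only consulted when there is spare parallelism.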
async function nextSegmentIsActionable(ctx, state, maxParallelism, end) {
// First, try with our cursor range, up to end.
if (await getNextUp(ctx, "pendingCancelation", {
start: state.segmentCursors.cancelation,
end,
})) {
return true;
}
if (await getNextUp(ctx, "pendingCompletion", {
start: state.segmentCursors.completion,
end,
})) {
return true;
}
if (state.running.length < maxParallelism) {
if (await getNextUp(ctx, "pendingStart", {
start: state.segmentCursors.incoming,
end,
})) {
return true;
}
}
return false;
}
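// Looks for items that landed behind the cursors (out-of-order writes) and, if
// found, returns the cursor rollback needed to pick them up.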
async function oldSegmentIsActionable(ctx, state, maxParallelism) {
// Next, we look for out-of-order additions we may have missed.
const oldCompletion = await getNextUp(ctx, "pendingCompletion", {
end: state.segmentCursors.completion,
});
if (oldCompletion) {
return [true, { completion: oldCompletion.segment }];
}
const oldCancelation = await getNextUp(ctx, "pendingCancelation", {
end: state.segmentCursors.cancelation,
});
if (oldCancelation) {
return [true, { cancelation: oldCancelation.segment }];
}
if (state.running.length < maxParallelism) {
const oldStart = await getNextUp(ctx, "pendingStart", {
end: state.segmentCursors.incoming,
});
if (oldStart) {
return [true, { incoming: oldStart.segment }];
}
}
return [false, {}];
}
// Fetch the next item in the segment range. If both start and end are given, the
// range is inclusive; if only one is given, that bound is exclusive. The start
// bound is always widened backward by CURSOR_BUFFER_SEGMENTS.
async function getNextUp(ctx, table, range) {
return ctx.db
.query(table)
.withIndex("segment", (q) => range.start !== undefined
? range.end !== undefined
? q
.gte("segment", range.start - CURSOR_BUFFER_SEGMENTS)
.lte("segment", range.end)
: q.gt("segment", range.start - CURSOR_BUFFER_SEGMENTS)
: range.end !== undefined
? q.lt("segment", range.end)
: q)
.first();
}
/**
* Handles the completion of pending completions.
* This only processes work that succeeded or failed, not canceled.
*/
async function handleCompletions(ctx, state, segment, console) {
const startSegment = state.segmentCursors.completion - CURSOR_BUFFER_SEGMENTS;
// This won't be too many: these completions were all scheduled by the previous
// run of main, so their number is bounded by the parallelism limit.
const completed = await ctx.db
.query("pendingCompletion")
.withIndex("segment", (q) => q.gte("segment", startSegment).lte("segment", segment))
.collect();
state.segmentCursors.completion = segment;
// Completions that were going to be retried but have since been canceled.
const toCancel = [];
await Promise.all(completed.map(async (c) => {
await ctx.db.delete(c._id);
const running = state.running.find((r) => r.workId === c.workId);
if (!running) {
console.error(`[main] completing ${c.workId} but it's not in "running"`);
return;
}
if (c.retry) {
// Only check for work if it's going to be retried.
const work = await ctx.db.get(c.workId);
if (!work) {
console.warn(`[main] ${c.workId} is gone, but trying to complete`);
return;
}
const retried = await rescheduleJob(ctx, work, console);
if (retried) {
state.report.retries++;
recordCompleted(console, work, "retrying");
}
else {
// We don't retry if it's been canceled in the meantime.
state.report.canceled++;
toCancel.push({
workId: c.workId,
runResult: { kind: "canceled" },
attempt: work.attempts,
});
}
}
else {
if (c.runResult.kind === "success") {
state.report.succeeded++;
}
else if (c.runResult.kind === "failed") {
state.report.failed++;
}
}
}));
// Filter `running` after the loop above so the stats can still see whether each item was in progress.
const before = state.running.length;
state.running = state.running.filter((r) => !completed.some((c) => c.workId === r.workId));
const numCompleted = before - state.running.length;
state.report.completed += numCompleted;
console.debug(`[main] completed ${numCompleted} work`);
return toCancel;
}
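// Drains up to CANCELLATION_BATCH_SIZE pendingCancelation rows: marks the work
// as canceled, removes any pendingStart entry so it never runs, and queues
// completion handling for everything canceled (including retries canceled in
// handleCompletions).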
async function handleCancelation(ctx, state, segment, console, toCancel) {
const start = state.segmentCursors.cancelation - CURSOR_BUFFER_SEGMENTS;
const canceled = await ctx.db
.query("pendingCancelation")
.withIndex("segment", (q) => q.gte("segment", start).lte("segment", segment))
.take(CANCELLATION_BATCH_SIZE);
state.segmentCursors.cancelation = canceled.at(-1)?.segment ?? segment;
if (canceled.length) {
console.debug(`[main] attempting to cancel ${canceled.length}`);
}
const canceledWork = new Set();
const runResult = { kind: "canceled" };
const jobs = toCancel.concat(...(await Promise.all(canceled.map(async ({ _id, _creationTime, workId }) => {
await ctx.db.delete(_id);
if (canceledWork.has(workId)) {
// We shouldn't have multiple pending cancelations for the same work.
console.error(`[main] ${workId} already canceled`);
return null;
}
const work = await ctx.db.get(workId);
if (!work) {
console.warn(`[main] ${workId} is gone, but trying to cancel`);
return null;
}
// Ensure it doesn't retry.
await ctx.db.patch(workId, { canceled: true });
// Ensure it doesn't start.
const pendingStart = await ctx.db
.query("pendingStart")
.withIndex("workId", (q) => q.eq("workId", workId))
.unique();
if (pendingStart && !canceledWork.has(workId)) {
state.report.canceled++;
await ctx.db.delete(pendingStart._id);
canceledWork.add(workId);
return { workId, runResult, attempt: work.attempts };
}
return null;
}))).flatMap((r) => (r ? [r] : [])));
if (jobs.length) {
await ctx.scheduler.runAfter(0, internal.complete.complete, { jobs });
}
}
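// Schedules recovery for running jobs that started more than
// RECOVERY_THRESHOLD_MS ago, and drops entries whose work documents no longer exist.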
async function handleRecovery(ctx, state, console) {
const missing = new Set();
const oldEnoughToConsider = Date.now() - RECOVERY_THRESHOLD_MS;
const jobs = (await Promise.all(state.running.map(async (r) => {
if (r.started >= oldEnoughToConsider) {
return null;
}
const work = await ctx.db.get(r.workId);
if (!work) {
missing.add(r.workId);
console.error(`[main] ${r.workId} already gone (skipping recovery)`);
return null;
}
return { ...r, attempt: work.attempts };
}))).flatMap((r) => (r ? [r] : []));
state.running = state.running.filter((r) => !missing.has(r.workId));
if (jobs.length) {
await ctx.scheduler.runAfter(0, internal.recovery.recover, { jobs });
}
}
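// Starts pending work until maxParallelism is reached, advancing the incoming
// cursor past whatever was scheduled.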
async function handleStart(ctx, state, segment, console, { maxParallelism, logLevel }) {
// Schedule as many as needed to reach maxParallelism.
const toSchedule = maxParallelism - state.running.length;
const pending = toSchedule > 0
? await ctx.db
.query("pendingStart")
.withIndex("segment", (q) => q
.gte("segment", state.segmentCursors.incoming - CURSOR_BUFFER_SEGMENTS)
.lte("segment", segment))
.take(toSchedule)
: [];
if (pending.length > 0) {
state.segmentCursors.incoming = pending.at(-1).segment;
}
else if (toSchedule > 0) {
// No more pending work in range; advance the cursor to now.
state.segmentCursors.incoming = segment;
}
console.debug(`[main] scheduling ${pending.length} pending work`);
// Start new work.
state.running.push(...(await Promise.all(pending.map(async ({ _id, workId, segment }) => {
if (state.running.some((r) => r.workId === workId)) {
console.error(`[main] ${workId} already running (skipping start)`);
return null;
}
const lagMs = Date.now() - fromSegment(segment);
const scheduledId = await beginWork(ctx, workId, logLevel, lagMs);
await ctx.db.delete(_id);
return { scheduledId, workId, started: Date.now() };
}))).flatMap((r) => (r ? [r] : [])));
}
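// Schedules the worker wrapper for one work item (action vs. mutation/query)
// and records the start for stats; returns the scheduled function id.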
async function beginWork(ctx, workId, logLevel, lagMs) {
const console = createLogger(logLevel);
const work = await ctx.db.get(workId);
if (!work) {
throw new Error("work not found");
}
recordStarted(console, work, lagMs);
const { attempts: attempt, fnHandle, fnArgs } = work;
const args = { workId, fnHandle, fnArgs, logLevel, attempt };
if (work.fnType === "action") {
return ctx.scheduler.runAfter(0, internal.worker.runActionWrapper, args);
}
else if (work.fnType === "mutation" || work.fnType === "query") {
return ctx.scheduler.runAfter(0, internal.worker.runMutationWrapper, {
...args,
fnType: work.fnType,
});
}
else {
throw new Error(`Unexpected fnType ${work.fnType}`);
}
}
/**
* Reschedules a job for retry.
* If it's been canceled in the meantime, don't retry.
* @returns true if the job was rescheduled, false if it was not.
*/
async function rescheduleJob(ctx, work, console) {
const pendingCancelation = await ctx.db
.query("pendingCancelation")
.withIndex("workId", (q) => q.eq("workId", work._id))
.unique();
if (pendingCancelation) {
// If there's an unprocessed cancelation request, don't retry.
console.warn(`[main] ${work._id} in pendingCancelation so not retrying`);
return false;
}
if (work.canceled) {
return false;
}
if (!work.retryBehavior) {
console.warn(`[main] ${work._id} has no retryBehavior so not retrying`);
return false;
}
const existing = await ctx.db
.query("pendingStart")
.withIndex("workId", (q) => q.eq("workId", work._id))
.first();
if (existing) {
// Not sure why this would ever happen, but ensure uniqueness explicitly.
console.error(`[main] ${work._id} already in pendingStart so not retrying`);
return false;
}
const backoffMs = work.retryBehavior.initialBackoffMs *
Math.pow(work.retryBehavior.base, work.attempts - 1);
const nextAttempt = withJitter(backoffMs);
const startTime = boundScheduledTime(Date.now() + nextAttempt, console);
const segment = toSegment(startTime);
await ctx.db.insert("pendingStart", {
workId: work._id,
segment,
});
return true;
}
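// Scales a delay by a random factor in [0.5, 1.5) so retries don't stampede.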
export function withJitter(delay) {
return delay * (0.5 + Math.random());
}
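// Reads the component's configuration, falling back to defaults if it has
// never been set.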
async function getGlobals(ctx) {
const globals = await ctx.db.query("globals").unique();
if (!globals) {
return {
maxParallelism: DEFAULT_MAX_PARALLELISM,
logLevel: DEFAULT_LOG_LEVEL,
};
}
return globals;
}
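// Loads the singleton internalState document, re-creating it from INITIAL_STATE
// (and logging an error) if it is unexpectedly missing.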
async function getOrCreateState(ctx) {
const state = await ctx.db.query("internalState").unique();
if (state)
return state;
const globals = await getGlobals(ctx);
const console = createLogger(globals.logLevel);
console.error("No internalState in running loop! Re-creating empty one...");
return (await ctx.db.get(await ctx.db.insert("internalState", INITIAL_STATE)));
}
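// Loads the singleton runStatus document, re-creating it in the "running" state
// (and logging an error) if it is unexpectedly missing.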
async function getOrCreateRunningStatus(ctx) {
const runStatus = await ctx.db.query("runStatus").unique();
if (runStatus)
return runStatus;
const globals = await getGlobals(ctx);
const console = createLogger(globals.logLevel);
console.error("No runStatus in running loop! Re-creating one...");
return (await ctx.db.get(await ctx.db.insert("runStatus", { state: { kind: "running" } })));
}
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const console = "THIS IS A REMINDER TO USE createLogger";
//# sourceMappingURL=loop.js.map