UNPKG

firelease

Version:

Firebase queue consumer for Node with at-least-once semantics

614 lines (566 loc) 24.7 kB
'use strict';

const _ = require('lodash');
const ms = require('ms');
const timers = require('safe-timers');

const PING_INTERVAL = ms('1m');
const PING_KEY = 'ping';

// Module-wide state shared by all queues attached in this process.
const queues = [];
const tasks = {};  // task key (full ref URL) -> Task
const blacklistedTaskKeys = new Set();
let globalMaxConcurrent = Number.MAX_VALUE;
let globalNumConcurrent = 0;
let shutdownResolve, shutdownReject, shutdownPromise;

// Debounced re-examination of every known task, regardless of which queue owns it.
const scanAll = _.debounce(() => {
  _.forEach(tasks, task => {
    task.queue.process(task);
  });
}, 100);

module.exports = {};

/**
 * Return this from a worker to retry after the current lease expires, and to reset the lease
 * backoff to zero.
 */
module.exports.RETRY = {};

/**
 * Set this to the maximum number of concurrent tasks being executed at any moment across all
 * queues.
 * @type {number}
 */
Object.defineProperty(module.exports, 'globalMaxConcurrent', {
  get: () => {return globalMaxConcurrent;},
  set: value => {
    globalMaxConcurrent = value;
    if (value) {
      // A non-zero limit cancels any shutdown in progress and restarts task processing.
      if (shutdownReject) shutdownReject(new Error('Queues restarted'));
      shutdownPromise = shutdownResolve = shutdownReject = null;
      scanAll();
    }
  }
});

/**
 * Default option values for all subsequent attachWorker calls. See that function for details.
 * @type {Object}
 */
module.exports.defaults = {
  maxConcurrent: Number.MAX_VALUE,
  bufferSize: 5,
  minLease: '30s',
  maxLease: '1h',
  leaseDelay: 0,
  maxLeaseDelay: 0,
  healthyPingLatency: '1.5s'
};

/**
 * A function used to capture errors. Defaults to logging the stack to the console, but you may
 * want to change it to something else in production. The function should take a single exception
 * argument.
 */
module.exports.captureError = error => {console.log(error.stack);};


/**
 * Local bookkeeping for a single queue item: its ref, its locally known lease expiry, the timer
 * that re-examines it, and the phase it is currently in.
 */
class Task {
  /**
   * @param {Queue} queue The queue that owns this task.
   * @param {Object} snap The snapshot of the item that was reported by the queue's listener.
   */
  constructor(queue, snap) {
    this.queue = queue;
    this.ref = snap.ref;
    this.key = Task.makeKey(snap);
    this.phase = 'wait';
    this.updateFrom(snap);
  }

  /**
   * Computes the key under which a task is registered in the module-wide `tasks` map.
   * @param {Object} snap A snapshot of the item.
   * @return {string} The string form of the snapshot's ref.
   */
  static makeKey(snap) {
    return snap.ref.toString();
  }

  /**
   * Refreshes the locally known expiry from a snapshot and clears the `removed` flag. Items with
   * no lease expiry are treated as expiring right now.
   * @param {Object} snap A snapshot of the item.
   */
  updateFrom(snap) {
    const value = snap.val();
    this.expiry = value && value._lease && value._lease.expiry || this.queue.now;
    // console.log('update', this.key, 'expiry', this.expiry);
    delete this.removed;
  }

  /**
   * Checks whether this task can be leased right now, and (re)arms the timer that will have the
   * queue look at this task again once its expiry (plus the queue's lease delay) has passed.
   * @return {boolean} True if the caller should go ahead and call process().
   */
  prepare() {
    if (tasks[this.key] !== this || this.removed || this.working) return false;
    const now = this.queue.now;
    const busy = this.expiry + this.queue.leaseDelay > now;
    // console.log('prepare', this.ref.key, 'expiry', this.expiry, 'now', now);
    if (!busy) {
      // Locally reserve for min lease duration to prevent concurrent transaction attempts.  Expiry
      // will be overwritten when transaction completes or task gets removed.
      this.expiry = now + this.queue.constrainLeaseDuration(0);
    }
    if (this.timeout) this.timeout.clear();
    this.timeout = timers.setTimeout(
      this.queue.process.bind(this.queue, this), this.expiry + this.queue.leaseDelay - now);
    return !busy;
  }

  /**
   * Attempts to acquire a lease on the item via a transaction and, if acquired, runs the worker.
   * Lease transaction failures are captured and trigger a rescan of the queue a little later.
   * @return {Promise} A promise that resolves once leasing (and any work) has finished; errors
   *     raised along the way are captured rather than propagated.
   */
  process() {
    let startTimestamp;
    let acquired;
    this.working = true;
    this.phase = 'lease';
    const transactionPromise = this.ref.transaction(item => {
      acquired = false;
      if (tasks[this.key] !== this || this.removed) return;
      if (!item || this.ref.key === PING_KEY) {
        // Missing items and ping markers are simply deleted; there is nothing to run.
        acquired = true;
        return null;
      }
      startTimestamp = this.queue.now;
      // console.log('txn  ', this.ref.key, 'lease', item._lease, 'now', startTimestamp);
      // Check if another process beat us to it.
      if (item._lease && item._lease.expiry &&
          item._lease.expiry + this.queue.leaseDelay > startTimestamp) {
        return item;
      }
      acquired = true;
      item._lease = item._lease || {};
      // Double the previous lease duration (within the queue's min/max bounds).
      item._lease.time = this.queue.constrainLeaseDuration(item._lease.time * 2 || 0);
      item._lease.expiry = startTimestamp + item._lease.time;
      item._lease.attempts = (item._lease.attempts || 0) + 1;
      if (!item._lease.initial) item._lease.initial = startTimestamp;
      item._lease.busy = true;
      return this.queue.callPreprocess(item);
    }, {detectStuck: 5, prefetchValue: false, timeout: ms('15s')});

    return transactionPromise.then(item => {
      if (!acquired) this.queue.countTaskAcquired(false);
      if (!acquired || item === null || this.ref.key === PING_KEY) return;
      if (!_.isObject(item)) throw new Error(`item not an object: ${item}`);
      Object.defineProperty(item, '$leaseTransaction', {value: transactionPromise.transaction});
      this.queue.countTaskAcquired(true);
      return this.run(item, startTimestamp);
    }).catch(error => {
      console.log(`Queue item ${this.key} lease transaction error: ${error.message}`);
      error.firelease = _.assign(error.firelease || {}, {itemKey: this.key, phase: 'leasing'});
      module.exports.captureError(error);
      // Hardcoded retry -- hard to do anything smarter, since we failed to update the task in
      // Firebase.
      this.expiry = 0;
      timers.setTimeout(this.queue.scan, ms('3s'));
    }).then(() => {
      this.working = false;
      this.phase = this.removed ? 'done' : 'retry';
    });
  }

  /**
   * Runs the worker on a leased item, reports lease overruns, and then applies the worker's
   * result to the item (remove, retry, reschedule, or replace the lease).
   * @param {Object} item The leased item value returned by the lease transaction.
   * @param {number} startTimestamp The queue time at which the lease was acquired.
   * @return {Promise} A promise that resolves when post-processing is done; errors are captured
   *     rather than propagated.
   */
  run(item, startTimestamp) {
    Object.defineProperty(item, '$ref', {value: this.ref});
    Object.defineProperty(item, '$leaseTimeRemaining', {get: () => {
      if (!(item._lease && item._lease.expiry)) return 0;
      return Math.max(0, item._lease.expiry - this.queue.now);
    }});
    this.phase = 'work';
    return this.queue.callWorker(item).finally(() => {
      const now = this.queue.now;
      if (now > item._lease.expiry) {
        this.phase = 'exceed';
        // If it looks like we exceeded the lease time, double-check against the current item before
        // crying wolf, in case the worker extended the lease.
        // eslint-disable-next-line no-shadow
        return this.ref.get({cache: false}).then(item => {
          // If no item, we can't tell if it's because the worker chose to delete it early, or
          // because it overran its lease and another worker picked it up and completed it, so say
          // nothing.
          if (!item) return;
          if (!item._lease) {
            console.log(
              `Queue item ${this.key} likely exceeded its lease time by taking`,
              ms(now - startTimestamp),
              'because the item has already been deleted and replaced with a new one.');
          } else if (now > item._lease.expiry) {
            console.log(
              `Queue item ${this.key} exceeded lease time of`,
              ms(item._lease.expiry - startTimestamp), 'by taking', ms(now - startTimestamp));
          }
        });
      }
    }).then(result => {
      this.phase = 'post';
      if (_.isNil(result)) return this.ref.remove();  // common shortcut
      return this.ref.transaction(item2 => {
        if (!item2) return null;
        let value = _.isFunction(result) ? result(item2) : result;
        if (_.isNil(value)) return null;
        if (value === module.exports.RETRY) {
          if (item2._lease) delete item2._lease.time;
        } else if (_.isNumber(value) || _.isString(value)) {
          value = duration(value);
          item2._lease = item2._lease || {};
          // Large values are absolute epochs, small ones are relative to the task's start time.
          item2._lease.expiry = value > 1000000000000 ? value : startTimestamp + value;
          delete item2._lease.time;
        } else if (_.isObject(value)) {
          item2._lease = value;
        } else {
          throw new Error(`Unexpected return value from worker: ${value}`);
        }
        if (item2._lease) delete item2._lease.busy;
        return item2;
      }, {prefetchValue: false}).then(item2 => {
        if (item2) item._lease = item2._lease;
      });
    }, error => {
      console.log(`Queue item ${this.key} processing error: ${error.message}`);
      error.firelease = _.assign(error.firelease || {}, {itemKey: this.key, phase: 'processing'});
      if (!error.level) error.level = 'warning';
      module.exports.captureError(error);
      // Reset busy flag, unless we exceeded our original lease in which case we can't be sure
      // whether another handler has already picked up the task so leave it be.
      if (this.phase !== 'exceed') return this.ref.child('_lease/busy').set(false);
    }).catch(error => {
      console.log(`Queue item ${this.key} post-processing error: ${error.message}`);
      error.firelease =
        _.assign(error.firelease || {}, {itemKey: this.key, phase: 'post-processing'});
      module.exports.captureError(error);
    });
  }
}


/**
 * A worker attached to one queue ref. Listens for items on the ref, registers them as tasks, and
 * processes them subject to the per-queue and global concurrency limits.
 */
class Queue {
  /**
   * @param {Object} ref The ref to the queue root.
   * @param {Object | Function} options Options (see attachWorker), or the worker if no options.
   * @param {Function} worker The worker function.
   */
  constructor(ref, options, worker) {
    if (_.isFunction(options)) {
      worker = options;
      options = {};
    }
    this.options = _.defaults({}, options, module.exports.defaults);
    this.options.minLease = duration(this.options.minLease);
    this.options.maxLease = duration(this.options.maxLease);
    // leaseDelay is mutable state on the queue (see countTaskAcquired); the configured value is
    // kept in options as the lower bound.
    this.options.minLeaseDelay = this.leaseDelay = duration(this.options.leaseDelay);
    delete this.options.leaseDelay;
    this.options.healthyPingLatency = duration(this.options.healthyPingLatency);
    this.options.maxLeaseDelay = duration(this.options.maxLeaseDelay);
    this.numConcurrent = 0;
    this.tasksAcquired = 0;
    this.worker = worker;
    this.ref = ref;
    // Need each queue's scan function to be debounced separately.
    this.scan = _.debounce(this.scan.bind(this), 100);

    const bufferAll = this.options.bufferSize === Infinity;
    const topRef = bufferAll ?
      ref : ref.orderByChild('_lease/expiry').limitToFirst(this.options.bufferSize);
    topRef.on('child_added', this.addTask, this.crash, this);
    topRef.on('child_removed', this.removeTask, this.crash, this);
    topRef.on(bufferAll ? 'child_changed' : 'child_moved', this.addTask, this.crash, this);
  }

  /**
   * Re-examines every registered task that belongs to this queue.
   */
  scan() {
    _.forEach(tasks, task => {
      if (task.queue === this) task.queue.process(task);
    });
  }

  /**
   * Cancel callback for the queue listeners: logs and captures the error, then exits the process.
   * @param {Error} error The error that interrupted the listener.
   */
  crash(error) {
    console.log(`Queue worker ${this.ref.toString()} interrupted:`, error);
    error.firelease =
      _.assign(error.firelease || {}, {queue: this.ref.toString(), phase: 'crashing'});
    module.exports.captureError(error);
    process.exit(1);
  }

  /** The current time as reported by the queue's ref. */
  get now() {
    return this.ref.now;
  }

  /**
   * Registers a new task or refreshes an existing one from a snapshot, then tries to process it.
   * Blacklisted tasks are dropped instead.
   * @param {Object} snap The snapshot of the added, changed or moved item.
   */
  addTask(snap) {
    const taskKey = Task.makeKey(snap);
    let task = tasks[taskKey];
    if (blacklistedTaskKeys.has(taskKey)) {
      if (task) this.removeTask(taskKey);
      return;
    }
    if (task) {
      task.updateFrom(snap);
    } else {
      task = tasks[taskKey] = new Task(this, snap);
    }
    this.process(task);
  }

  /**
   * Marks a task as removed and cancels its timer. The task is unregistered immediately unless it
   * is currently being worked on, in which case Queue.process unregisters it when it finishes.
   * @param {Object | string} snapOrKey The snapshot of the removed item, or its task key.
   */
  removeTask(snapOrKey) {
    const taskKey = typeof snapOrKey === 'string' ? snapOrKey : Task.makeKey(snapOrKey);
    const task = tasks[taskKey];
    if (!task) return;
    task.removed = true;
    if (task.timeout) {
      task.timeout.clear();
      delete task.timeout;
    }
    if (!task.working) delete tasks[taskKey];
  }

  /**
   * @return {boolean} True if both the queue and the global concurrency limits allow one more
   *     task to start.
   */
  hasQuota() {
    return this.numConcurrent < this.options.maxConcurrent &&
      globalNumConcurrent < globalMaxConcurrent;
  }

  /**
   * Clamps a lease duration to the queue's [minLease, maxLease] range.
   * @param {number} time The desired lease duration in milliseconds.
   * @return {number} The constrained duration in milliseconds.
   */
  constrainLeaseDuration(time) {
    return Math.min(this.options.maxLease, Math.max(time, this.options.minLease));
  }

  /**
   * Records the outcome of a lease attempt. When automatic lease delay adjustment is enabled
   * (non-zero maxLeaseDelay), nudges the delay up by 1 after a success and down by 2 after a
   * failure, within [minLeaseDelay, maxLeaseDelay].
   * @param {boolean} acquired Whether the lease was acquired.
   */
  countTaskAcquired(acquired) {
    if (this.options.maxLeaseDelay) {
      this.leaseDelay = Math.max(this.options.minLeaseDelay, Math.min(
        this.options.maxLeaseDelay, this.leaseDelay + (acquired ? 1 : -2)));
    }
    if (acquired) this.tasksAcquired++;
  }

  /**
   * Starts processing a task if there is quota and the task is ready, keeping the concurrency
   * counters up to date and rescanning when a slot frees up at the limit.
   * @param {Task} task The task to (maybe) process.
   */
  process(task) {
    if (this.hasQuota() && task.prepare()) {
      globalNumConcurrent++;
      this.numConcurrent++;
      task.process().then(() => {
        globalNumConcurrent--;
        this.numConcurrent--;
        if (task.removed) delete tasks[task.key];
        // Only rescan when we just dropped below a limit, since that's when tasks may have been
        // held back for lack of quota.
        if (globalNumConcurrent === globalMaxConcurrent - 1) {
          scanAll();
        } else if (this.numConcurrent === this.options.maxConcurrent - 1) {
          this.scan();
        }
        if (shutdownResolve && !globalMaxConcurrent && !globalNumConcurrent) shutdownResolve();
        if (!globalMaxConcurrent) {
          console.log(`Queues draining, tasks in progress: ${globalNumConcurrent}`);
        }
      }).catch(error => {
        error.message = `Unexpected error in Queue.process: ${error.message}`;
        module.exports.captureError(error);
      });
    }
  }

  /**
   * Applies the optional `preprocess` option to an item inside the leasing transaction.
   * @param {Object} item The item being leased.
   * @return {Object} The (possibly modified) item.
   */
  callPreprocess(item) {
    if (this.options.preprocess) item = this.options.preprocess(item);
    return item;
  }

  /**
   * Invokes the worker and normalizes its outcome (value, promise, generator, or synchronous
   * throw) into a promise.
   * @param {Object} item The leased item.
   * @return {Promise} A promise for the worker's result.
   */
  callWorker(item) {
    try {
      const result = this.worker(item);
      if (result && typeof result.next === 'function' && typeof result.throw === 'function' &&
          Promise.co) {
        // Got a generator, let's co-ify it nicely to capture errors.
        return Promise.co(result);
      }
      return Promise.resolve(result);
    } catch (e) {
      return Promise.reject(e);
    }
  }
}


/**
 * Attaches a worker function to consume tasks from a queue.  You should normally attach no more
 * than one worker per path in any given process, but it's OK to run multiple processes on the same
 * paths concurrently.  If you do, you probably want to set `maxLeaseDelay` to something greater
 * than zero, to properly balance task distribution between the processes.
 *
 * All durations can be specified as either a human-readable string, or a number of milliseconds.
 *
 * @param {NodeFire} ref A NodeFire ref to the queue root in Firebase.  Individual tasks will be
 *     children of this root and must be objects.  The '_lease' key is reserved for use by
 *     Firelease in each task.
 * @param {Object} options Optional options, supporting the following values:
 *     maxConcurrent: {number} max number of tasks to handle concurrently for this worker.
 *     bufferSize: {number} upper bound on how many tasks to keep buffered and potentially go
 *         through leasing transactions in parallel.  In principle, it's not worth setting higher
 *         than `maxConcurrent`, but you can set it to `Infinity` to keep the entire task queue
 *         buffered at all times if needed.
 *     minLease: {number | string} minimum duration of each lease, which should equal the maximum
 *         expected time a worker will take to handle a task.
 *     maxLease: {number | string} maximum duration of each lease; the lease duration is doubled
 *         each time a task fails until it reaches maxLease.
 *     leaseDelay: {number | string} duration by which to delay leasing an item after it becomes
 *         available; useful for setting up "backup" servers that only grab tasks that aren't taken
 *         up fast enough by the primary.
 *     maxLeaseDelay: {number | string} if non-zero, enables automatic leaseDelay adjustment and
 *         sets the maximum duration to wait before attempting to acquire a ready task.  This is
 *         often necessary to compensate for differences in machine or network speed, or for
 *         Firebase's consistent order for sending event notifications to multiple clients.
 *     preprocess: {function(Object):Object} a function to use to preprocess each item during the
 *         leasing transaction.  This function must be fast, synchronous, idempotent, and
 *         should return the modified item (passed as the sole argument, OK to mutate).  One use
 *         for preprocessing is to clean up items written to a queue by a process outside your
 *         control (e.g., webhooks).
 *     healthyPingLatency: {number | string} the maximum response latency to pings that is
 *         considered "healthy" for this queue.
 * @param {function(Object):RETRY | number | string | undefined} worker The worker function that
 *     handles enqueued tasks.  It will be given a task object as argument, with a special $ref
 *     attribute set to the Nodefire ref of that task.  The worker can perform arbitrary
 *     computation whose duration should not exceed the queue's minLease value.  It can
 *     manipulate the task itself in Firebase as well, e.g. to delete it (to get at-most-once
 *     queue semantics) or otherwise modify it.  The worker can return any of the following:
 *     * undefined or null to cause the task to be retired from the queue.
 *     * firelease.RETRY to cause the task to be retried after the current lease expires (and
 *       reset the lease backoff counter).
 *     * A duration after which the task should be retried relative to when it was started.
 *     * An epoch in milliseconds greater than 1000000000000 at which the task should be tried.
 *     * A complete _lease object, to be saved as-is.
 *     * A function that takes the task as argument and returns one of the values above.  This
 *       function will be executed in a transaction to ensure atomicity.
 *     All of these values can also be wrapped in a promise or a generator, which will be dealt
 *     with appropriately.
 */
module.exports.attachWorker = function(ref, options, worker) {
  queues.push(new Queue(ref, options, worker));
};

/**
 * Normalizes a duration to milliseconds.
 * @param {number | string} value A number of milliseconds, or a string understood by `ms`.
 * @return {number} The duration in milliseconds.
 */
function duration(value) {
  if (_.isNumber(value)) return value;
  return ms(value);
}


let pinging = false;
let pingIntervalHandle, pingCallback;

/**
 * Sets up regular pinging of all queues.  Can be called either before or after workers are
 * attached, and will always ping all queues.  Can be called more than once to change the
 * parameters.
 *
 * All durations can be specified as either a human-readable string, or a number of milliseconds.
 *
 * @param {Function(Object) | null} callback The callback to invoke with a report each time we ping
 *     all the queues.  The report looks like: {healthy: true, maxLatency: 1234}.  If not
 *     specified, reports are silently dropped.
 * @param {number | string} interval The interval at which to ping queues, to both check the
 *     current response latency and make sure no tasks are stuck.  Defaults to 1 minute.
 */
module.exports.pingQueues = function(callback, interval) {
  interval = interval && duration(interval) || PING_INTERVAL;
  if (pingIntervalHandle) pingIntervalHandle.clear();
  pingCallback = callback;
  pingIntervalHandle = timers.setInterval(() => {
    checkPings().catch(error => {
      error.firelease = _.assign(error.firelease || {}, {phase: 'pinging'});
      error.level = 'warning';
      module.exports.captureError(error);
      pinging = false;
    });
  }, interval);
};

/**
 * Pings every attached queue by writing a ping item and timing how long it takes for it to be
 * deleted again, then reports aggregate health to the ping callback.  Does nothing if a previous
 * round of pings is still in flight.
 * @return {Promise} A promise that resolves when the round of pings is complete.
 */
function checkPings() {
  if (pinging) return Promise.resolve();
  pinging = true;
  return Promise.all(_.map(queues, queue => {
    const start = Date.now();
    const pingRef = queue.ref.child(PING_KEY);
    let pingFree;
    return pingRef.transaction(item => {
      pingFree = !item;
      return item || {timestamp: start, _lease: {expiry: 1}};
    }, {prefetchValue: false, timeout: ms('10s')}).then(() => {
      if (!pingFree) return null;  // another process is currently pinging
      return waitUntilDeleted(pingRef, queue.options.healthyPingLatency + ms('10s')).then(() => {
        const latency = Date.now() - start;
        return {
          queue, latency, healthy: latency < queue.options.healthyPingLatency,
          leaseDelay: queue.leaseDelay, tasksAcquired: queue.tasksAcquired
        };
      }, () => null);
    });
  })).then(results => {
    results = _.compact(results);
    if (results.length) {
      // Backup scan in case tasks are stuck on a queue due to bugs.
      scanAll();
      if (pingCallback) {
        const sickQueueKeys = _(results).reject('healthy').map(item => item.queue.ref.key).value();
        const delays = _(results).map('leaseDelay').sortBy().value();
        // For an even count the median is the mean of the two middle elements, which sit at
        // indices mid - 1 and mid.
        const mid = Math.floor(delays.length / 2);
        const delaysMedian =
          delays.length % 2 ? delays[mid] : (delays[mid - 1] + delays[mid]) / 2;
        pingCallback({
          healthy: _.every(results, 'healthy'),
          sickQueues: sickQueueKeys,
          stuckTasks: blacklistedTaskKeys.size,
          maxLatency: _.max(_.map(results, 'latency')),
          tasksAcquired: _.reduce(results, (sum, result) => sum + result.tasksAcquired, 0),
          leaseDelays: {min: _.min(delays), max: _.max(delays), median: delaysMedian}
        });
      }
    }
    pinging = false;
  });
}

/**
 * Waits until the value at the given ref becomes empty.
 * @param {Object} ref The ref to watch.
 * @param {number} timeout Optional maximum time to wait, in milliseconds.
 * @return {Promise} A promise that resolves once the value is gone, and rejects if the listener
 *     is cancelled or the timeout elapses first.
 */
function waitUntilDeleted(ref, timeout) {
  return new Promise((resolve, reject) => {
    let timeoutHandle;
    let settled = false;

    // Settles the promise exactly once, always detaching the listener and cancelling the timer so
    // that neither outlives the wait.
    function finish(settle, arg) {
      if (settled) return;
      settled = true;
      ref.off('value', onValue);
      if (timeoutHandle) timeoutHandle.clear();
      settle(arg);
    }

    function onValue(snap) {
      if (snap.val()) return;
      finish(resolve);
    }

    ref.on('value', onValue, error => {finish(reject, error);});
    // The value callback may already have settled the promise, in which case there's nothing
    // left to time out.
    if (timeout && !settled) {
      timeoutHandle = timers.setTimeout(() => {finish(reject, new Error('timeout'));}, timeout);
    }
  });
}


/**
 * Extends the lease on a task to give the worker more time to finish.  Checks a bunch of validity
 * constraints along the way and throws an error if the worker needs to abort.
 *
 * All durations can be specified as either a human-readable string, or a number of milliseconds.
 *
 * @param {Object} item The original task object provided to a worker function.
 * @param {number | string} timeNeeded The minimum time needed counting from the current time.  The
 *     actual lease may be extended by up to twice this amount, to prevent excessive churn.
 * @return {Promise} A promise that will be resolved when the lease has been extended, and rejected
 *     if something went wrong and the worker should abort.
 */
module.exports.extendLease = function(item, timeNeeded) {
  if (!(item && item._lease && item._lease.expiry)) throw new Error('Invalid task');
  // Concurrent calls share one transaction; track the largest amount of time requested so far.
  item._lease.timeNeeded = Math.max(item._lease.timeNeeded || 0, duration(timeNeeded));
  if (!item._lease.extendLeasePromise) {
    if (!globalMaxConcurrent) return Promise.reject(new Error('shutdown in progress'));
    let error, timeNeededUsed;
    item._lease.extendLeasePromise = item.$ref.transaction(item2 => {
      // The update function may run more than once, so reset state on every attempt.
      error = null;
      timeNeededUsed = null;
      const now = item.$ref.now;
      if (!item2) {
        error = new Error('Task disappeared, unable to extend lease.');
        error.firelease = {code: 'gone'};
        item2 = null;  // make sure we attempt a write to force sha check
      } else if (!item2._lease) {
        error = new Error('Task recreated, unable to extend lease.');
        error.firelease = {code: 'recreated'};
      } else if (item._lease.expiry !== item2._lease.expiry) {
        error = new Error('Task leased by another worker, unable to extend lease.');
        error.firelease = {code: 'stolen'};
      } else if (item2._lease.expiry <= now) {
        error = new Error('Lease expired, unable to extend.');
        error.firelease = {code: 'lost'};
      } else {
        timeNeededUsed = item._lease.timeNeeded;
        // Expiry is monotonically increasing, so safe to do early abort if it's high enough.
        if (item2._lease.expiry >= now + timeNeededUsed) return;
        item2._lease.expiry += timeNeededUsed;
      }
      return item2;
    }, {prefetchValue: false}).then(item2 => {
      let moreTimeNeeded;
      if (item._lease) {
        if (item._lease.timeNeeded > timeNeededUsed) moreTimeNeeded = item._lease.timeNeeded;
        delete item._lease.extendLeasePromise;
        delete item._lease.timeNeeded;
      }
      if (error) {
        error.firelease = _.assign(
          error.firelease || {}, {itemKey: item.$ref.toString(), timeNeeded});
        return Promise.reject(error);
      }
      if (item2 && item._lease) item._lease.expiry = item2._lease.expiry;
      if (moreTimeNeeded) {
        // If an extendLease raced with the transaction then retry it.
        return module.exports.extendLease(item, moreTimeNeeded);
      }
    });
  }
  return item._lease.extendLeasePromise;
};

/**
 * Blacklist the given task key from ever being processed again.
 * @param {string} taskKey The task key to blacklist.  This is the full Firebase URL of the task and
 *     can be obtained from an error using `error.firelease.itemKey`.
 * @return {boolean} True if the task key was added to the list, false if it was already present.
 */
module.exports.blacklist = function(taskKey) {
  if (blacklistedTaskKeys.has(taskKey)) return false;
  blacklistedTaskKeys.add(taskKey);
  const task = tasks[taskKey];
  if (task) task.queue.removeTask(taskKey);
  return true;
};

/**
 * Shuts down firelease by refusing to take new tasks.
 * @return {Promise<void>} A promise that resolves when the shutdown is complete.
 */
module.exports.shutdown = function() {
  globalMaxConcurrent = 0;
  if (!shutdownPromise) {
    shutdownPromise = new Promise((resolve, reject) => {
      shutdownResolve = resolve;
      shutdownReject = reject;
    });
  }
  if (!globalNumConcurrent) shutdownResolve();
  return shutdownPromise;
};

/**
 * Lists the URLs of all tasks that are currently being worked on.
 * @return {Array<string>} The keys of all tasks whose `working` flag is set.
 */
module.exports.listTasksInProgress = function() {
  return _(tasks).map((task, key) => task.working ? key : null).compact().value();
};