UNPKG

@bennadel/circuit-breaker

Version:

A flexible circuit breaker for Node.js (requires ES6 class modules).

510 lines (328 loc) 14.1 kB
// Require the application modules.
var Metrics = require( "../metrics/Metrics" );
var SharedMonitor = require( "../monitor/SharedMonitor" );
var StateError = require( "../error/StateError" );

// ----------------------------------------------------------------------------------- //
// ----------------------------------------------------------------------------------- //

var THREE_SECONDS = ( 3 * 1000 );
var FIFTY_PERCENT = 50;

// I am the default failure classifier - I treat every error as a true failure.
function IS_FAILURE( error ) {

	return( true );

}

// I provide an implementation of the State logic that dictates the control-flow of a
// circuit breaker.
class State {

	// I initialize the state with the given settings:
	// --
	// * id - The unique identifier of this state instance (used for logging).
	// * requestTimeout - The time a pending request is allowed to hang before being timed-out.
	// * volumeThreshold - The number of requests that have to be executed (in the window) before failure percentages are calculated.
	// * failureThreshold - The percentage of failures that can occur in the window before the state switches to open.
	// * activeThreshold - The number of concurrent requests that can hang before the state switches to open.
	// * isFailure() - The function that determines if the given failure is a true error (or should be classified as a success).
	// * monitor - The Monitor instance for external logging.
	// * metrics - The Metrics instance for tracking activity.
	// --
	// CAUTION: Each setting is defaulted using a logical-OR, which means that ANY falsy
	// value (including 0 and the empty string) is replaced by the default value.
	constructor( settings = {} ) {

		this._id = ( settings.id || "Default State Identifier" );
		this._requestTimeout = ( settings.requestTimeout || THREE_SECONDS );
		this._volumeThreshold = ( settings.volumeThreshold || 20 );
		this._failureThreshold = ( settings.failureThreshold || FIFTY_PERCENT );
		this._activeThreshold = ( settings.activeThreshold || 50 );
		this._isFailure = ( settings.isFailure || IS_FAILURE );
		this._monitor = ( settings.monitor || SharedMonitor );
		this._metrics = ( settings.metrics || new Metrics() );

		// I determine if the circuit is closed.
		this._closed = true;

		// I determine if the circuit is being held open for reasons of failure-based
		// health problems.
		this._healing = false;

		// I keep track of the pending executions. In addition to tracking errors, this
		// state implementation also tracks concurrent requests. If the number of
		// concurrent requests exceeds the given threshold (activeThreshold), the circuit
		// will open until some of the pending requests have completed. This is
		// independent from the rolling metrics window.
		this._activeRequestCount = 0;

		// I keep track of the pending failed count just to ensure proper state.
		this._activeFallbackCount = 0;

	}

	// ---
	// PUBLIC METHODS.
	// ---

	// I determine if the OPENED circuit can perform a health check (ie, allow a new
	// request to pass through the circuit execution despite its unhealthy state). I
	// always return False for a closed circuit.
	canPerformHealthCheck() {

		if ( this.isClosed() ) {

			return( false );

		}

		return( ! ( this._isOverCapacity() || this._isTakingTimeToHeal() ) );

	}

	// I get a snapshot of the current state (primarily for logging).
	getSnapshot() {

		return({
			id: this._id,
			closed: this.isClosed(),
			settings: {
				requestTimeout: this._requestTimeout,
				volumeThreshold: this._volumeThreshold,
				failureThreshold: this._failureThreshold,
				activeThreshold: this._activeThreshold
			},
			metrics: {
				emit: this._metrics.get( "emit" ),
				execute: this._metrics.get( "execute" ),
				success: this._metrics.get( "success" ),
				failure: this._metrics.get( "failure" ),
				timeout: this._metrics.get( "timeout" )
			},
			totalMetrics: {
				emit: this._metrics.getTotal( "emit" ),
				execute: this._metrics.getTotal( "execute" ),
				success: this._metrics.getTotal( "success" ),
				failure: this._metrics.getTotal( "failure" ),
				timeout: this._metrics.getTotal( "timeout" )
			},
			current: {
				activeRequestCount: this._activeRequestCount
			}
		});

	}

	// I determine if the circuit is opened (and unable to accept requests).
	isOpened() {

		return( ! this._closed );

	}

	// I determine if the circuit is closed (and able to accept requests).
	isClosed() {

		return( this._closed );

	}

	// I get the duration (in milliseconds) that a pending execution is allowed to hang
	// before it is forced into a rejected state.
	getTimeout() {

		return( this._requestTimeout );

	}

	// I track requests that will be executed (ie, not short-circuited). I throw a
	// StateError if the circuit is opened and is either over capacity or still taking
	// time to heal (ie, a health check is not currently permitted).
	trackExecute() {

		if ( this.isOpened() ) {

			if ( this._isOverCapacity() ) {

				throw( new StateError( "You cannot execute while the circuit is over capacity." ) );

			}

			if ( this._isTakingTimeToHeal() ) {

				throw( new StateError( "You cannot execute while the circuit is taking time to heal." ) );

			}

		}

		// NOTE: We are not incrementing the active request count because it is already
		// being incremented by the "emit" event, immediately preceding this event.
		this._metrics.increment( "execute" );
		this._applyUpdates();
		this._monitor.logExecute( this.getSnapshot() );

	}

	// I track new requests being routed through the circuit breaker (though execution of
	// the underlying command is not yet guaranteed).
	trackEmit() {

		this._activeRequestCount++;
		this._metrics.increment( "emit" );
		this._applyUpdates();
		this._monitor.logEmit( this.getSnapshot() );

	}

	// I track requests that have failed to execute in the circuit breaker due to non-
	// circuit breaker logic (ie, this does not include Timeout or Open errors).
	trackFailure( duration, error ) {

		// Not all errors actually indicate an unhealthy resource. For example, a
		// "Not Found" error, returned from an API, relates only to the content of the
		// request and not to the actual health of the API. As such, some errors should
		// be classified as a "success" metric.
		if ( ! this._isFailure( error ) ) {

			return( this.trackSuccess( duration ) );

		}

		this._assertActiveRequestCount();
		this._activeRequestCount--;
		this._metrics.increment( "failure" );
		this._applyUpdates();
		this._monitor.logFailure( this.getSnapshot(), duration, error );

	}

	// I track rejected requests that are proceeding to the fallback workflow.
	trackFallbackEmit() {

		this._activeFallbackCount++;
		this._monitor.logFallbackEmit( this.getSnapshot() );

	}

	// I track rejected requests that have failed to resolve with an existing fallback
	// value (as opposed to a "fallback missing" event, which has no fallback).
	trackFallbackFailure( error ) {

		this._assertActiveFallbackCount();
		this._activeFallbackCount--;
		this._monitor.logFallbackFailure( this.getSnapshot(), error );

	}

	// I track requests that have been rejected without an existing fallback value.
	trackFallbackMissing() {

		this._assertActiveFallbackCount();
		this._activeFallbackCount--;
		this._monitor.logFallbackMissing( this.getSnapshot() );

	}

	// I track rejected requests that have successfully resolved with a fallback value.
	trackFallbackSuccess() {

		this._assertActiveFallbackCount();
		this._activeFallbackCount--;
		this._monitor.logFallbackSuccess( this.getSnapshot() );

	}

	// I track requests that have been summarily rejected due to an open circuit.
	trackShortCircuited( error ) {

		this._assertActiveRequestCount();
		this._activeRequestCount--;
		this._applyUpdates();
		this._monitor.logShortCircuited( this.getSnapshot(), error );

	}

	// I track requests that have successfully executed in the circuit breaker.
	trackSuccess( duration ) {

		this._assertActiveRequestCount();

		// If the circuit is currently being held open for reasons other than capacity,
		// then any successful response during this time of poor health may indicate that
		// the underlying resource has, indeed, recovered. As such, let's reset the
		// metrics for the current window (which will affect the application of the
		// updates to the current state).
		// --
		// NOTE: We want to reset the metrics so that a subsequent failure doesn't
		// immediately flip the circuit back into an opened state. We want the metrics to
		// have to accumulate the volume threshold once again before flipping open.
		// --
		// NOTE: The capacity check is performed BEFORE the active request count is
		// decremented, so the current request is still counted as pending here.
		if ( this.isOpened() && ! this._isOverCapacity() ) {

			this._metrics.reset();
			// NOTE: Once this method is done executing, the only recorded metric will be
			// a single "success" event.

		}

		this._activeRequestCount--;
		this._metrics.increment( "success" );
		this._applyUpdates();
		this._monitor.logSuccess( this.getSnapshot(), duration );

	}

	// I track requests that have not returned in the allotted timeout period.
	trackTimeout( duration, error ) {

		this._assertActiveRequestCount();
		this._activeRequestCount--;
		this._metrics.increment( "timeout" );
		this._applyUpdates();
		this._monitor.logTimeout( this.getSnapshot(), duration, error );

	}

	// ---
	// PRIVATE METHODS.
	// ---

	// I apply the recent updates to the state of the circuit, moving the circuit from
	// opened-to-closed or closed-to-open as necessary.
	_applyUpdates() {

		// If the circuit is CLOSED, check to see if it needs to be opened.
		if ( this.isClosed() ) {

			// NOTE: Failure is checked before capacity so that a failing circuit is
			// always flagged as healing, even if it is also over capacity.
			if ( this._isFailing() ) {

				this._openAndHeal();

			} else if ( this._isOverCapacity() ) {

				this._open();

			}

		// If the circuit is OPENED, check to see if it needs to be closed.
		} else {

			if ( ! this._isOverCapacity() && this._isHealed() ) {

				this._close();

			}

		}

	}

	// I ensure that the active fallback count is positive before the calling context
	// attempts to decrement the count.
	_assertActiveFallbackCount() {

		if ( this._activeFallbackCount <= 0 ) {

			throw( new StateError( "You cannot track the end of a fallback when you have no pending fallbacks." ) );

		}

	}

	// I ensure that the active request count is positive before the calling context
	// attempts to decrement the count.
	_assertActiveRequestCount() {

		if ( this._activeRequestCount <= 0 ) {

			throw( new StateError( "You cannot track the end of an execution when you have no pending executions." ) );

		}

	}

	// I move the circuit to a closed state (and clear the healing flag). I throw a
	// StateError if the circuit is already closed.
	_close() {

		if ( this.isClosed() ) {

			throw( new StateError( "State already closed." ) );

		}

		this._closed = true;
		this._healing = false;
		this._monitor.logClosed( this.getSnapshot() );

	}

	// I determine if the circuit is currently exceeding the failure threshold and should
	// be considered unhealthy.
	_isFailing() {

		var successCount = this._metrics.get( "success" );
		var failureCount = this._metrics.get( "failure" );
		var timeoutCount = this._metrics.get( "timeout" );

		var errorCount = ( failureCount + timeoutCount );
		var totalCount = ( successCount + failureCount + timeoutCount );

		// If we haven't recorded enough outcomes, we don't want to let the circuit fail.
		// Doing so could lead to a 100% failure rate (for example) if the first request
		// in each bucket results in a failure.
		if ( totalCount < this._volumeThreshold ) {

			return( false );

		}

		// CAUTION: Failure threshold is defined in whole numbers (ie, 5% not 0.05%).
		return( ( errorCount / totalCount * 100 ) >= this._failureThreshold );

	}

	// I check to see if a healing circuit has finally healed.
	_isHealed() {

		// If the circuit hasn't been flagged as healing, then we don't even need to
		// check the metrics window.
		if ( ! this._healing ) {

			return( true );

		}

		var executeCount = this._metrics.get( "execute" );
		var successCount = this._metrics.get( "success" );
		var failureCount = this._metrics.get( "failure" );
		var timeoutCount = this._metrics.get( "timeout" );
		var totalCount = ( executeCount + successCount + failureCount + timeoutCount );

		// The circuit will be considered healed when the only inbound or outbound metric
		// is a single Success metric. We know this is true because a successful response
		// during an open circuit will reset the metrics, recording only the subsequent
		// success event.
		return( ( totalCount === 1 ) && ( successCount === 1 ) );

	}

	// I determine if the circuit is currently over capacity for pending requests.
	// --
	// NOTE: This is a strict comparison - the circuit is only over capacity once the
	// pending request count EXCEEDS the active threshold (not when it reaches it).
	_isOverCapacity() {

		return( this._activeRequestCount > this._activeThreshold );

	}

	// I determine if the circuit is currently being held open for "healing" reasons and
	// still needs to wait for the current metrics window to become quiet.
	_isTakingTimeToHeal() {

		// If the circuit hasn't been flagged as healing, then we don't even need to
		// check the metrics window.
		if ( ! this._healing ) {

			return( false );

		}

		var executeCount = this._metrics.get( "execute" );
		var successCount = this._metrics.get( "success" );
		var failureCount = this._metrics.get( "failure" );
		var timeoutCount = this._metrics.get( "timeout" );
		var activityCount = ( executeCount + successCount + failureCount + timeoutCount );

		// If the current metrics window no longer has any trace of outbound or inbound
		// activity, it means the full metrics window has had a chance to cycle through
		// while opened and is now in a quiet state. Once this is done, the circuit is
		// no longer waiting to heal and can accept a health check.
		// --
		// NOTE: We are not including "emit" in this since emit doesn't directly relate
		// to outbound traffic.
		// --
		// NOTE: The sum is coerced to a Boolean so that this predicate returns the same
		// type as the other "is" checks (the truthiness of the result is unchanged).
		return( Boolean( activityCount ) );

	}

	// I move the circuit to an opened state (without flagging it as healing). I throw a
	// StateError if the circuit is already opened.
	_open() {

		if ( this.isOpened() ) {

			throw( new StateError( "Already opened." ) );

		}

		this._closed = false;
		this._monitor.logOpened( this.getSnapshot() );

	}

	// I move the circuit to an opened state and hold it open, giving the underlying
	// resource time to heal. I throw a StateError if the circuit is already opened.
	_openAndHeal() {

		if ( this.isOpened() ) {

			throw( new StateError( "Already opened." ) );

		}

		this._closed = false;
		this._healing = true;
		this._monitor.logOpened( this.getSnapshot() );

	}

}

// ----------------------------------------------------------------------------------- //
// ----------------------------------------------------------------------------------- //

module.exports = State;