lynkr
Version:
Self-hosted LLM gateway and tier-routing proxy for Claude Code, Cursor, and Codex. Routes across Ollama, AWS Bedrock, OpenRouter, Databricks, Azure OpenAI, llama.cpp, and LM Studio with prompt caching, MCP tools, and 60-80% cost savings.
204 lines (175 loc) • 5.9 kB
JavaScript
const os = require("os");
const logger = require("../../logger");
const { ServiceUnavailableError } = require("./error-handling");
/**
 * Load shedding middleware
 *
 * Features:
 * - Detect process overload (V8 heap usage, RSS relative to system memory,
 *   number of in-flight requests)
 * - Reject requests with 503 when overloaded
 * - Protect system from cascading failures
 * - Minimal performance overhead (memory sampling is cached per check interval)
 */
class LoadShedder {
  /**
   * @param {object} [options]
   * @param {number} [options.memoryThreshold=0.85] RSS / total system memory ratio above which load is shed.
   * @param {number} [options.heapThreshold=0.95] heapUsed / heapTotal ratio above which load is shed.
   * @param {number} [options.activeRequestsThreshold=1000] In-flight request count above which load is shed.
   * @param {number} [options.checkInterval=1000] Milliseconds a memory sample stays cached; 0 samples on every call.
   * @param {number} [options.minHeapUsedMB=500] Heap shedding only applies once heapUsed exceeds this many MB.
   */
  constructor(options = {}) {
    // Accept numbers and numeric strings; everything else (undefined, null,
    // NaN from a failed env parse, ...) is treated as "not provided".
    const numeric = (value) => {
      if (typeof value === "number") return value;
      if (typeof value === "string" && value.trim() !== "") return Number(value);
      return NaN;
    };
    // Thresholds must be strictly positive; NaN fails the comparison and falls back.
    const positiveOr = (value, fallback) => {
      const parsed = numeric(value);
      return parsed > 0 ? parsed : fallback;
    };
    // Zero is a meaningful value here (e.g. "no caching", "no heap floor").
    const nonNegativeOr = (value, fallback) => {
      const parsed = numeric(value);
      return parsed >= 0 ? parsed : fallback;
    };

    // Thresholds
    this.memoryThreshold = positiveOr(options.memoryThreshold, 0.85); // 85%
    // 95% (increased from 90% to prevent false positives from temporary allocation spikes)
    this.heapThreshold = positiveOr(options.heapThreshold, 0.95);
    this.activeRequestsThreshold = positiveOr(options.activeRequestsThreshold, 1000);
    // heapTotal starts small and grows, so a high heapUsed/heapTotal ratio is
    // only treated as overload once absolute usage is above this floor.
    this.minHeapUsedMB = nonNegativeOr(options.minHeapUsedMB, 500);

    // State
    this.activeRequests = 0;
    this.totalShed = 0;
    this.lastCheck = Date.now();
    this.checkInterval = nonNegativeOr(options.checkInterval, 1000); // Check every second
    this.cachedMemoryPressure = false; // Result of the last memory sample
    this.cachedOverloadState = false; // Result of the last isOverloaded() call
  }

  /**
   * Check if system is overloaded.
   *
   * Memory pressure is sampled at most once per `checkInterval` and cached in
   * between. The in-flight request count is compared on every call: it is a
   * plain counter, and caching it would let a burst inside one interval run
   * far past the limit (and keep shedding after the load has drained).
   *
   * @returns {boolean} true when new requests should be rejected.
   */
  isOverloaded() {
    const now = Date.now();
    const refreshDue = now - this.lastCheck >= this.checkInterval;

    if (refreshDue) {
      this.lastCheck = now;
      this.cachedMemoryPressure = this.checkMemoryPressure();
    }

    const tooManyRequests = this.activeRequests > this.activeRequestsThreshold;

    // Warn at most once per refresh tick (not per rejected request) so a
    // sustained overload cannot flood the log.
    if (tooManyRequests && refreshDue && !this.cachedMemoryPressure) {
      logger.warn(
        {
          activeRequests: this.activeRequests,
          threshold: this.activeRequestsThreshold,
        },
        "Load shedding: Active requests exceeded threshold"
      );
    }

    this.cachedOverloadState = this.cachedMemoryPressure || tooManyRequests;
    return this.cachedOverloadState;
  }

  /**
   * Sample process memory and report whether a memory threshold is exceeded.
   * Logs a warning describing the first threshold that tripped.
   * Intended to be called by isOverloaded() on its refresh tick.
   *
   * @returns {boolean} true when heap or RSS usage is above its threshold.
   */
  checkMemoryPressure() {
    const memUsage = process.memoryUsage();

    // Heap: require BOTH a high used/total ratio AND significant absolute
    // usage, to avoid false positives while heapTotal is still small.
    const heapUsedPercent = memUsage.heapUsed / memUsage.heapTotal;
    const heapUsedMB = memUsage.heapUsed / (1024 * 1024);
    if (heapUsedPercent > this.heapThreshold && heapUsedMB > this.minHeapUsedMB) {
      logger.warn(
        {
          heapUsedPercent: (heapUsedPercent * 100).toFixed(2),
          heapUsedMB: heapUsedMB.toFixed(2),
          threshold: (this.heapThreshold * 100).toFixed(2),
          minThresholdMB: this.minHeapUsedMB,
        },
        "Load shedding: Heap usage exceeded threshold"
      );
      return true;
    }

    // RSS relative to total system memory
    const rssPercent = memUsage.rss / os.totalmem();
    if (rssPercent > this.memoryThreshold) {
      logger.warn(
        {
          rssPercent: (rssPercent * 100).toFixed(2),
          threshold: (this.memoryThreshold * 100).toFixed(2),
        },
        "Load shedding: RSS memory usage exceeded threshold"
      );
      return true;
    }

    return false;
  }

  /**
   * Get current metrics.
   *
   * Takes a fresh memory sample (not the cached one). Percentages and MB
   * values are formatted as strings with two decimals.
   *
   * @returns {object} Counters, memory figures and configured thresholds.
   */
  getMetrics() {
    const memUsage = process.memoryUsage();
    return {
      activeRequests: this.activeRequests,
      totalShed: this.totalShed,
      heapUsedPercent: ((memUsage.heapUsed / memUsage.heapTotal) * 100).toFixed(2),
      heapUsedMB: (memUsage.heapUsed / (1024 * 1024)).toFixed(2),
      heapTotalMB: (memUsage.heapTotal / (1024 * 1024)).toFixed(2),
      rssMB: (memUsage.rss / (1024 * 1024)).toFixed(2),
      rssPercent: ((memUsage.rss / os.totalmem()) * 100).toFixed(2),
      thresholds: {
        heapThreshold: (this.heapThreshold * 100).toFixed(2),
        memoryThreshold: (this.memoryThreshold * 100).toFixed(2),
        activeRequestsThreshold: this.activeRequestsThreshold,
      },
    };
  }
}
// Process-wide LoadShedder, created lazily on first access
let instance = null;

/**
 * Return the shared LoadShedder, creating it on the first call.
 *
 * Defaults come from the LOAD_SHEDDING_* environment variables and are
 * overridden by any keys present in `options`. Once the instance exists,
 * `options` passed to later calls are ignored.
 *
 * @param {object} [options] Overrides forwarded to the LoadShedder constructor.
 * @returns {LoadShedder} The shared instance.
 */
function getLoadShedder(options) {
  if (instance) {
    return instance;
  }

  const { env } = process;
  const heapThreshold = Number.parseFloat(env.LOAD_SHEDDING_HEAP_THRESHOLD || "0.95");
  const memoryThreshold = Number.parseFloat(env.LOAD_SHEDDING_MEMORY_THRESHOLD || "0.85");
  const activeRequestsThreshold = Number.parseInt(
    env.LOAD_SHEDDING_ACTIVE_REQUESTS_THRESHOLD || "1000",
    10
  );

  // Caller-supplied keys win over the environment-derived defaults.
  const resolved = Object.assign(
    {},
    { heapThreshold, memoryThreshold, activeRequestsThreshold },
    options
  );

  instance = new LoadShedder(resolved);
  return instance;
}
/**
 * Create (or fetch) the shared load shedder and log its effective thresholds.
 * Intended to be called once at server startup so the configuration shows up
 * in the logs.
 *
 * @param {object} [options] Forwarded unchanged to getLoadShedder().
 * @returns {LoadShedder} The shared load shedder instance.
 */
function initializeLoadShedder(options) {
  const shedder = getLoadShedder(options);

  // Ratios are reported as percentage strings with two decimals.
  const asPercent = (ratio) => (ratio * 100).toFixed(2);
  const { heapThreshold, memoryThreshold, activeRequestsThreshold } = shedder;

  const thresholds = {
    heapThreshold: asPercent(heapThreshold),
    memoryThreshold: asPercent(memoryThreshold),
    activeRequestsThreshold,
  };

  logger.info({ enabled: true, thresholds }, "Load shedding initialized");

  return shedder;
}
/**
 * Load shedding middleware with the (req, res, next) signature.
 *
 * When the shared shedder reports overload, the request is counted as shed,
 * a "Retry-After: 5" header is set and a ServiceUnavailableError is passed to
 * next() (presumably rendered as a 503 by the downstream error handler).
 * Otherwise the request is counted as in-flight until the response emits
 * "finish" or "close", whichever comes first.
 *
 * @param {object} req Incoming request (not inspected here).
 * @param {object} res Response; must support setHeader() and on().
 * @param {Function} next Continuation callback.
 */
function loadSheddingMiddleware(req, res, next) {
  const shedder = getLoadShedder();

  if (!shedder.isOverloaded()) {
    // Admit the request and track it as in-flight.
    shedder.activeRequests++;

    // "finish" and "close" may both fire for one response; release only once.
    let released = false;
    const release = () => {
      if (released) {
        return;
      }
      released = true;
      shedder.activeRequests--;
    };

    for (const eventName of ["finish", "close"]) {
      res.on(eventName, release);
    }

    next();
    return;
  }

  // Overloaded: reject and hint the client to retry in 5 seconds.
  shedder.totalShed++;
  const overloadError = new ServiceUnavailableError(
    "Service temporarily overloaded. Please retry after a few seconds."
  );
  res.setHeader("Retry-After", "5");
  return next(overloadError);
}
// Public API: the LoadShedder class, the shared-instance accessor, the
// startup initializer that logs the configuration, and the middleware.
module.exports = {
LoadShedder,
getLoadShedder,
initializeLoadShedder,
loadSheddingMiddleware,
};