lynkr
Version:
Self-hosted LLM gateway and tier-routing proxy for Claude Code, Cursor, and Codex. Routes across Ollama, AWS Bedrock, OpenRouter, Databricks, Azure OpenAI, llama.cpp, and LM Studio with prompt caching, MCP tools, and 60-80% cost savings.
191 lines (172 loc) • 5.84 kB
JavaScript
/**
* Rolling Latency Tracker (per provider:model)
*
* Tracks latency keyed by `${provider}:${model}` so models within a provider
* (Opus vs Haiku) get separate stats. Backward-compatible: callers that pass
* only a provider still work — they're tracked under `${provider}:*`.
*
* Phase 1.5 of the routing overhaul: previous version keyed by provider only.
*
* @module routing/latency-tracker
*/
const logger = require("../logger");
const BUFFER_SIZE = 200;
const MIN_SAMPLES = 10;
/** Wildcard model used when caller doesn't specify one. */
const ANY_MODEL = '*';
function _key(provider, model) {
return `${provider}:${model || ANY_MODEL}`;
}
class LatencyTracker {
constructor() {
/** @type {Map<string, { buffer: number[], index: number, count: number, lastUpdated: number, provider: string, model: string }>} */
this._entries = new Map();
}
/**
* Record a latency measurement.
*
* Signatures:
* record(provider, latencyMs) // legacy
* record(provider, model, latencyMs) // preferred
*/
record(provider, modelOrLatency, maybeLatency) {
let model;
let latencyMs;
if (typeof modelOrLatency === 'number') {
model = ANY_MODEL;
latencyMs = modelOrLatency;
} else {
model = modelOrLatency || ANY_MODEL;
latencyMs = maybeLatency;
}
if (!provider || typeof latencyMs !== "number" || latencyMs < 0) return;
const k = _key(provider, model);
let entry = this._entries.get(k);
if (!entry) {
entry = {
buffer: new Array(BUFFER_SIZE).fill(0),
index: 0,
count: 0,
lastUpdated: 0,
provider,
model,
};
this._entries.set(k, entry);
}
entry.buffer[entry.index] = latencyMs;
entry.index = (entry.index + 1) % BUFFER_SIZE;
entry.count += 1;
entry.lastUpdated = Date.now();
}
_computeStats(entry) {
if (!entry || entry.count === 0) return null;
const sampleCount = Math.min(entry.count, BUFFER_SIZE);
const samples = entry.buffer.slice(0, sampleCount);
const sorted = samples.slice().sort((a, b) => a - b);
const sum = sorted.reduce((acc, v) => acc + v, 0);
return {
p50: sorted[Math.floor(sampleCount * 0.5)],
p95: sorted[Math.floor(sampleCount * 0.95)],
p99: sorted[Math.floor(sampleCount * 0.99)],
avg: Math.round(sum / sampleCount),
count: entry.count,
lastUpdated: entry.lastUpdated,
provider: entry.provider,
model: entry.model,
};
}
/**
* Get stats for a specific (provider, model) pair, or aggregated for a provider
* if model is omitted.
*/
getStats(provider, model = null) {
if (model) {
return this._computeStats(this._entries.get(_key(provider, model)));
}
// Aggregate across all models for this provider
const provEntries = [];
for (const [k, entry] of this._entries) {
if (entry.provider === provider) provEntries.push(entry);
}
if (provEntries.length === 0) return null;
if (provEntries.length === 1) return this._computeStats(provEntries[0]);
// Pool samples across model entries to compute combined percentiles
const pooled = [];
let total = 0;
let lastUpdated = 0;
for (const e of provEntries) {
const n = Math.min(e.count, BUFFER_SIZE);
for (let i = 0; i < n; i++) pooled.push(e.buffer[i]);
total += e.count;
if (e.lastUpdated > lastUpdated) lastUpdated = e.lastUpdated;
}
if (pooled.length === 0) return null;
pooled.sort((a, b) => a - b);
const sum = pooled.reduce((acc, v) => acc + v, 0);
return {
p50: pooled[Math.floor(pooled.length * 0.5)],
p95: pooled[Math.floor(pooled.length * 0.95)],
p99: pooled[Math.floor(pooled.length * 0.99)],
avg: Math.round(sum / pooled.length),
count: total,
lastUpdated,
provider,
model: ANY_MODEL,
};
}
/** Latency penalty/bonus used by complexity-analyzer. */
penalizeScore(provider, model = null) {
const stats = this.getStats(provider, model);
if (!stats || stats.count < MIN_SAMPLES) return 0;
if (stats.p95 > 10000) return 10;
if (stats.p95 > 5000) return 5;
if (stats.p50 < 1000) return -5;
return 0;
}
/**
* Phase 1.5: per-model P95 lookup for deadline-aware routing (Phase 6.3).
* Returns null if insufficient samples.
*/
getModelP95(provider, model) {
const stats = this.getStats(provider, model);
if (!stats || stats.count < MIN_SAMPLES) return null;
return stats.p95;
}
/**
* Whether a model is currently degraded (P95 > 2x its historical median).
* Currently uses a simple absolute threshold — better signal will come in
* Phase 4.3 (drift detection).
*/
isDegraded(provider, model) {
const stats = this.getStats(provider, model);
if (!stats || stats.count < MIN_SAMPLES) return false;
return stats.p95 > stats.p50 * 2 && stats.p95 > 5000;
}
/**
* Get stats for every tracked entry.
*
* Backward-compat: when an entry was recorded via the legacy 2-arg
* `record(provider, latency)` signature, the model is the wildcard `*`
* and we return it keyed by provider name only. Entries with explicit
* models use the `provider:model` key.
*/
getAllStats() {
const result = new Map();
for (const [k, entry] of this._entries) {
const stats = this._computeStats(entry);
if (!stats) continue;
const outKey = entry.model === ANY_MODEL ? entry.provider : k;
result.set(outKey, stats);
}
return result;
}
}
let instance = null;
function getLatencyTracker() {
if (!instance) {
instance = new LatencyTracker();
logger.debug("LatencyTracker initialised");
}
return instance;
}
module.exports = { LatencyTracker, getLatencyTracker, ANY_MODEL };