lynkr
Version:
Self-hosted LLM gateway and tier-routing proxy for Claude Code, Cursor, and Codex. Routes across Ollama, AWS Bedrock, OpenRouter, Databricks, Azure OpenAI, llama.cpp, and LM Studio with prompt caching, MCP tools, and 60-80% cost savings.
762 lines (691 loc) • 26.9 kB
JavaScript
/**
* Smart Routing Module
*
* Intelligent request routing based on complexity analysis.
* Routes simple requests to local models (Ollama, llama.cpp)
* and complex requests to cloud providers.
*
* @module routing
*/
const config = require('../config');
const logger = require('../logger');
const {
analyzeComplexity,
shouldForceLocal,
shouldForceCloud,
routingMetrics,
analyzeWithEmbeddings,
} = require('./complexity-analyzer');
// Intelligent routing modules
const { getAgenticDetector, AGENT_TYPES } = require('./agentic-detector');
const { getModelTierSelector, TIER_DEFINITIONS } = require('./model-tiers');
const { getCostOptimizer } = require('./cost-optimizer');
const { analyzeRisk } = require('./risk-classifier');
// Phase 3-6 routing modules
const { getKnnRouter } = require('./knn-router');
const { getBandit } = require('./bandit');
const { getShadowPolicy, compareAndLog: shadowCompareAndLog } = require('./shadow-mode');
const { chooseFastest } = require('./deadline');
const { applyTenantOverrides } = require('./tenant-policy');
// Telemetry modules
const telemetry = require('./telemetry');
const { scoreResponseQuality } = require('./quality-scorer');
const { getLatencyTracker } = require('./latency-tracker');
// Phase 1 modules
const contextValidator = require('./context-validator');
const { countPayloadTokens } = require('./tokenizer');
// Local providers
const LOCAL_PROVIDERS = ['ollama', 'llamacpp', 'lmstudio'];
/**
* Returns true when any message content block is an image.
* Handles both string content and structured content arrays.
*/
function _payloadHasImages(payload) {
const messages = payload?.messages;
if (!Array.isArray(messages)) return false;
return messages.some(msg => {
const content = msg?.content;
if (!Array.isArray(content)) return false;
return content.some(block => block?.type === 'image' || block?.type === 'image_url');
});
}
/**
* List of providers that currently have credentials configured.
* Used by the Phase 1.2 cost-optimizer override to scope candidates.
*/
function _enabledProviders() {
const out = [];
if (config.databricks?.url && config.databricks?.apiKey) out.push('databricks');
if (config.azureAnthropic?.endpoint && config.azureAnthropic?.apiKey) out.push('azure-anthropic');
if (config.bedrock?.apiKey) out.push('bedrock');
if (config.openrouter?.apiKey) out.push('openrouter');
if (config.openai?.apiKey) out.push('openai');
if (config.azureOpenAI?.endpoint && config.azureOpenAI?.apiKey) out.push('azure-openai');
if (config.ollama?.endpoint) out.push('ollama');
if (config.llamacpp?.endpoint) out.push('llamacpp');
if (config.lmstudio?.endpoint) out.push('lmstudio');
return out;
}
/**
* Check if a provider is local
*/
function isLocalProvider(provider) {
return LOCAL_PROVIDERS.includes(provider);
}
/**
* Check if fallback is enabled
*/
function isFallbackEnabled() {
return config.modelProvider?.fallbackEnabled !== false;
}
/**
* Get the configured fallback provider
*/
function getFallbackProvider() {
return config.modelProvider?.fallbackProvider ?? 'databricks';
}
/**
* Get the best available cloud provider
* @param {Object} options - Options for provider selection
* @param {number} options.toolCount - Number of tools in the request (for hybrid routing)
* @param {boolean} options.useHybridRouting - Whether to use hybrid routing logic (default: false)
*/
function getBestCloudProvider() {
// Standard priority order for cloud providers
if (config.databricks?.url && config.databricks?.apiKey) return 'databricks';
if (config.azureAnthropic?.endpoint && config.azureAnthropic?.apiKey) return 'azure-anthropic';
if (config.bedrock?.apiKey) return 'bedrock';
if (config.openrouter?.apiKey) return 'openrouter';
if (config.openai?.apiKey) return 'openai';
if (config.azureOpenAI?.endpoint && config.azureOpenAI?.apiKey) return 'azure-openai';
return getFallbackProvider();
}
/**
* Get the best available local provider
*/
function getBestLocalProvider() {
if (config.ollama?.endpoint) return 'ollama';
if (config.llamacpp?.endpoint) return 'llamacpp';
if (config.lmstudio?.endpoint) return 'lmstudio';
return 'ollama'; // Default
}
/**
* Determine the optimal provider based on request complexity
*
* This is the main routing function that implements all 4 phases:
* - Phase 1: Basic scoring (tokens, tools, task type)
* - Phase 2: Advanced classification (code complexity, reasoning)
* - Phase 3: Metrics tracking
* - Phase 4: Optional embeddings-based adjustment
*
* @param {Object} payload - Request payload
* @param {Object} options - Routing options
* @returns {Object} Routing decision with provider and metadata
*/
const sessionAffinity = require('./session-affinity');
/**
* Provider routing with session affinity.
*
* When a conversation already carries tool history, reuse the provider the
* session first routed to so tool-call IDs don't break across providers.
* Fresh turns route normally and refresh the session's pinned provider.
*/
async function determineProviderSmart(payload, options = {}) {
const sessionId = payload?._sessionId || null;
// Enforce affinity only for in-flight tool exchanges — the turns that 400
// if the provider changes. Fresh turns keep full per-turn tier routing.
if (sessionId && !options.forceProvider && sessionAffinity.payloadHasToolHistory(payload)) {
const pinned = sessionAffinity.getPinned(sessionId);
if (pinned) {
logger.debug({ sessionId, provider: pinned.provider, tier: pinned.tier },
'[Routing] Session affinity — reusing provider for tool-bearing turn');
return {
provider: pinned.provider,
model: pinned.model,
tier: pinned.tier,
method: 'session_affinity',
reason: 'tool_history_provider_pin',
};
}
}
const decision = await _determineProviderSmartInner(payload, options);
// Remember the chosen provider so later tool-bearing turns stay consistent.
if (sessionId && decision?.provider && !options.forceProvider) {
sessionAffinity.setPinned(sessionId, decision);
}
return decision;
}
async function _determineProviderSmartInner(payload, options = {}) {
const primaryProvider = config.modelProvider?.type ?? 'databricks';
// Risk analysis runs orthogonally to complexity. We compute it once
// up-front so it can short-circuit force_local and feed the tier
// selector below. Even when tier routing is disabled we still surface
// the signal for telemetry.
let risk = null;
try {
risk = analyzeRisk(payload);
} catch (err) {
logger.debug({ err: err.message }, '[Routing] Risk analysis failed, ignoring');
risk = null;
}
// If tier routing is disabled, use static configuration
if (!config.modelTiers?.enabled) {
return {
provider: primaryProvider,
model: null,
method: 'static',
reason: 'tier_routing_disabled',
risk,
};
}
// High-risk requests jump straight to COMPLEX and skip the rest of
// the analysis. This is independent of complexity score — a one-line
// edit to auth/middleware.ts should never go to a local model.
if (risk?.level === 'high' && isFallbackEnabled()) {
try {
const selector = getModelTierSelector();
const modelSelection = selector.selectModel('COMPLEX', null);
const decision = {
provider: modelSelection.provider,
model: modelSelection.model,
tier: 'COMPLEX',
method: 'risk',
reason: 'high_risk_forced_tier',
score: 100,
risk,
};
routingMetrics.record(decision);
logger.debug({
tier: 'COMPLEX',
provider: decision.provider,
instructionHits: risk.instructionHits,
pathHits: risk.pathHits,
}, '[Routing] High risk → forcing tier');
return decision;
} catch (err) {
logger.debug({ err: err.message }, '[Routing] Risk-forced tier selection failed, falling through');
}
}
// Quick check for force patterns
if (shouldForceLocal(payload)) {
// When tier routing is enabled, respect TIER_SIMPLE instead of blindly choosing local
if (config.modelTiers?.enabled) {
try {
const selector = getModelTierSelector();
const modelSelection = selector.selectModel('SIMPLE', null);
const decision = {
provider: modelSelection.provider,
model: modelSelection.model,
tier: 'SIMPLE',
method: 'force',
reason: 'force_local_pattern',
score: 0,
risk,
};
routingMetrics.record(decision);
return decision;
} catch (err) {
logger.debug({ err: err.message }, 'Tier selection failed for force_local, falling back to local provider');
}
}
const provider = getBestLocalProvider();
const decision = {
provider,
model: null,
method: 'force',
reason: 'force_local_pattern',
score: 0,
risk,
};
routingMetrics.record(decision);
return decision;
}
if (shouldForceCloud(payload) && isFallbackEnabled()) {
const provider = getBestCloudProvider();
const decision = {
provider,
model: null,
method: 'force',
reason: 'force_cloud_pattern',
score: 100,
risk,
};
routingMetrics.record(decision);
return decision;
}
// Full complexity analysis (pass workspace for code-graph integration)
const useWeightedScoring = config.routing?.weightedScoring ?? false;
const analysis = await analyzeComplexity(payload, { weighted: useWeightedScoring, workspace: options.workspace });
// Phase 4: Optional embeddings adjustment
let embeddingsResult = null;
if (options.useEmbeddings !== false && config.ollama?.embeddingsModel) {
try {
embeddingsResult = await analyzeWithEmbeddings(payload);
if (embeddingsResult?.adjustment) {
analysis.score = Math.max(0, Math.min(100,
analysis.score + embeddingsResult.adjustment
));
analysis.embeddingsAdjustment = embeddingsResult.adjustment;
}
} catch (err) {
logger.debug({ err: err.message }, 'Embeddings analysis failed, using heuristics only');
}
}
// Agentic workflow detection
let agenticResult = null;
if (config.routing?.agenticDetection !== false) {
try {
const detector = getAgenticDetector();
agenticResult = detector.detect(payload);
// Boost complexity score for agentic workflows
if (agenticResult.isAgentic) {
analysis.score = Math.min(100, analysis.score + agenticResult.scoreBoost);
analysis.agenticBoost = agenticResult.scoreBoost;
analysis.agentType = agenticResult.agentType;
logger.debug({
agentType: agenticResult.agentType,
boost: agenticResult.scoreBoost,
newScore: analysis.score,
}, '[Routing] Agentic workflow detected, boosting score');
// Force cloud for autonomous workflows
if (agenticResult.agentType === 'AUTONOMOUS' && isFallbackEnabled()) {
const provider = getBestCloudProvider();
const decision = {
provider,
method: 'agentic',
reason: 'autonomous_workflow',
score: analysis.score,
agenticResult,
risk,
};
routingMetrics.record(decision);
return decision;
}
}
} catch (err) {
logger.debug({ err: err.message }, 'Agentic detection failed');
}
}
// Tier-based model selection
let selectedModel = null;
let tier = null;
if (config.modelTiers?.enabled) {
try {
const selector = getModelTierSelector();
tier = selector.getTier(analysis.score);
// Check if agentic detection requires a higher tier
if (agenticResult?.minTier) {
const agenticTierPriority = TIER_DEFINITIONS[agenticResult.minTier]?.priority || 0;
const currentTierPriority = TIER_DEFINITIONS[tier]?.priority || 0;
if (agenticTierPriority > currentTierPriority) {
tier = agenticResult.minTier;
logger.debug({ from: selector.getTier(analysis.score), to: tier }, '[Routing] Upgrading tier for agentic workflow');
}
}
// Select model for the tier (will be applied after provider selection)
analysis.tier = tier;
} catch (err) {
logger.debug({ err: err.message }, 'Tier selection failed');
}
}
// Apply routing decision based on tier config (TIER_* env vars take precedence
// but Phase 1.2 lets the cost-optimizer pick a cheaper qualifying model when safe).
let provider;
let method = 'tier_config';
let costOptimized = false;
const selector = getModelTierSelector();
const modelSelection = selector.selectModel(tier, null);
provider = modelSelection.provider;
selectedModel = modelSelection.model;
logger.debug({ tier, provider, model: selectedModel }, '[Routing] Using tier config');
// Phase 1.2 — cost-optimizer override.
// Only kick in when:
// - feature flag enabled (default true, disable with LYNKR_COST_OPTIMIZE=false)
// - risk level is not high (high-risk keeps the explicitly-configured model)
// - the optimizer finds a meaningfully cheaper qualifying model
const costOptimizeEnabled = process.env.LYNKR_COST_OPTIMIZE !== 'false'
&& config.routing?.costOptimize !== false;
if (costOptimizeEnabled && risk?.level !== 'high') {
try {
const optimizer = getCostOptimizer();
const availableProviders = _enabledProviders();
const cheapest = optimizer.findCheapestForTier(tier, availableProviders);
if (cheapest && cheapest.model && cheapest.model !== selectedModel) {
const current = optimizer.estimateCost(selectedModel, 1000);
const candidate = optimizer.estimateCost(cheapest.model, 1000);
if (candidate.totalEstimate > 0 && candidate.totalEstimate < current.totalEstimate * 0.75) {
logger.debug({
tier,
from: `${provider}:${selectedModel}`,
to: `${cheapest.provider}:${cheapest.model}`,
savedPerK: (current.totalEstimate - candidate.totalEstimate).toFixed(6),
}, '[Routing] Cost-optimizer override');
provider = cheapest.provider;
selectedModel = cheapest.model;
method = 'tier_config+cost_optimized';
costOptimized = true;
}
}
} catch (err) {
logger.debug({ err: err.message }, '[Routing] Cost-optimize failed, keeping tier_config selection');
}
}
// Phase 1.3 — context window validation. If estimated tokens exceed the
// selected model's context (with response headroom), escalate to a
// context-capable model regardless of tier.
try {
const estimatedTokens = countPayloadTokens(payload, selectedModel);
const ctxResult = contextValidator.validate(selectedModel, estimatedTokens);
if (!ctxResult.ok) {
const capable = selector.findContextCapable(estimatedTokens, tier);
if (capable) {
logger.info({
from: `${provider}:${selectedModel}`,
to: `${capable.provider}:${capable.model}`,
required: estimatedTokens,
oldContext: ctxResult.context,
newContext: capable.context,
}, '[Routing] Context window escalation');
provider = capable.provider;
selectedModel = capable.model;
if (capable.tier) tier = capable.tier;
method = method + '+context_escalated';
} else {
logger.warn({
model: selectedModel,
required: estimatedTokens,
available: ctxResult.context,
}, '[Routing] No context-capable fallback — request may fail upstream');
}
}
} catch (err) {
logger.debug({ err: err.message }, '[Routing] Context validation failed, proceeding without check');
}
// Phase 1.4 — vision capability guard.
// If the payload contains image content blocks but the selected model lacks
// vision support, silently swap to the cheapest vision-capable model at or
// above the current tier. Prevents silent upstream failures.
if (_payloadHasImages(payload)) {
try {
const { getModelRegistrySync } = require('./model-registry');
const registry = getModelRegistrySync();
const modelInfo = registry.getCost(selectedModel);
if (!modelInfo?.vision) {
const visionModel = selector.findVisionCapable(tier);
if (visionModel) {
logger.info({
from: `${provider}:${selectedModel}`,
to: `${visionModel.provider}:${visionModel.model}`,
tier: visionModel.tier,
}, '[Routing] Vision guard — upgrading to vision-capable model');
provider = visionModel.provider;
selectedModel = visionModel.model;
if (visionModel.tier !== tier) tier = visionModel.tier;
method = method + '+vision_guard';
} else {
logger.warn({ model: selectedModel }, '[Routing] Vision guard — no vision-capable model found, request may fail');
}
}
} catch (err) {
logger.debug({ err: err.message }, '[Routing] Vision guard check failed, proceeding');
}
}
// Phase 3.1 — kNN routing hint.
// If the index has enough entries, query it with the last user message.
// A high-confidence kNN suggestion overrides the heuristic selection.
let knnResult = null;
if (config.routing?.knnEnabled !== false) {
try {
const msgs = payload?.messages;
const lastMsg = Array.isArray(msgs) ? msgs[msgs.length - 1]?.content : null;
const queryText = typeof lastMsg === 'string' ? lastMsg
: Array.isArray(lastMsg) ? lastMsg.filter(b => b?.type === 'text').map(b => b.text || '').join(' ')
: null;
if (queryText) {
knnResult = await getKnnRouter().query(queryText);
if (knnResult && knnResult.confidence > 0.7 && knnResult.model && knnResult.model !== selectedModel) {
// High confidence — trust kNN's model recommendation directly.
logger.debug({
from: `${provider}:${selectedModel}`,
to: `${knnResult.provider}:${knnResult.model}`,
confidence: knnResult.confidence.toFixed(3),
}, '[Routing] kNN override');
provider = knnResult.provider;
selectedModel = knnResult.model;
method = method + '+knn';
} else if (knnResult && knnResult.confidence > 0.4 && knnResult.confidence <= 0.7) {
// Ambiguous signal — neighbors are split, we can't trust any single model
// recommendation. Err on quality: bump the current tier one step up so the
// request gets a more capable model rather than risking a bad answer from
// a model that was borderline for similar past requests.
const TIER_ORDER = ['SIMPLE', 'MEDIUM', 'COMPLEX', 'REASONING'];
const currentIdx = TIER_ORDER.indexOf(tier);
if (currentIdx >= 0 && currentIdx < TIER_ORDER.length - 1) {
const upgradedTier = TIER_ORDER[currentIdx + 1];
try {
const upgraded = selector.selectModel(upgradedTier, null);
logger.debug({
from: `${tier}:${provider}:${selectedModel}`,
to: `${upgradedTier}:${upgraded.provider}:${upgraded.model}`,
confidence: knnResult.confidence.toFixed(3),
}, '[Routing] kNN ambiguous — escalating tier for safety');
provider = upgraded.provider;
selectedModel = upgraded.model;
tier = upgradedTier;
method = method + '+knn_ambiguous_escalate';
} catch (err) {
logger.debug({ err: err.message }, '[Routing] kNN ambiguous escalation failed, keeping current tier');
}
}
}
}
} catch (err) {
logger.debug({ err: err.message }, '[Routing] kNN query failed, ignoring');
}
}
// Phase 4.1 — LinUCB bandit intra-tier selection.
// When there are two candidates (heuristic vs kNN), the bandit picks the
// one with the highest estimated UCB score for the current context.
if (config.routing?.banditEnabled !== false && knnResult && knnResult.model) {
try {
// Build candidates: current selection and kNN alternative if different
const allCandidates = [{ provider, model: selectedModel }];
if (knnResult.model !== selectedModel) {
allCandidates.push({ provider: knnResult.provider, model: knnResult.model });
}
if (allCandidates.length > 1) {
const bandit = getBandit();
const TASK_TYPES = ['code_gen', 'summarization', 'reasoning', 'factoid', 'chat', 'other'];
const inferredTask = (analysis.breakdown?.taskType?.reason || 'other').toLowerCase();
const taskIdx = Math.max(0, TASK_TYPES.findIndex(t => inferredTask.includes(t)));
const ctx = [
(analysis.score || 0) / 100,
Math.log(Math.max(1, analysis.breakdown?.tokenCount || 0) + 1) / 15,
((payload?.tools?.length ?? 0) > 0) ? 1 : 0,
options.streaming ? 1 : 0,
risk?.level === 'high' ? 1 : risk?.level === 'medium' ? 0.5 : 0,
agenticResult?.isAgentic ? 1 : 0,
...TASK_TYPES.map((_, i) => i === taskIdx ? 1 : 0),
];
const picked = bandit.pick(tier, allCandidates, ctx);
if (picked && picked.model !== selectedModel) {
logger.debug({
from: `${provider}:${selectedModel}`,
to: `${picked.provider}:${picked.model}`,
ucb: picked.ucb?.toFixed(4),
explored: picked.explored,
}, '[Routing] Bandit override');
provider = picked.provider;
selectedModel = picked.model;
method = method + (picked.explored ? '+bandit_explore' : '+bandit');
}
}
} catch (err) {
logger.debug({ err: err.message }, '[Routing] Bandit pick failed, ignoring');
}
}
// Phase 6.3 — deadline-aware fastest-model selection.
// Payload carries _deadlineMs injected by the orchestrator from the
// LYNKR-Deadline-Ms request header.
const deadlineMs = payload?._deadlineMs ?? null;
if (deadlineMs) {
try {
const fastest = chooseFastest([{ provider, model: selectedModel }], deadlineMs);
if (fastest && fastest.model !== selectedModel) {
logger.debug({
from: `${provider}:${selectedModel}`,
to: `${fastest.provider}:${fastest.model}`,
deadlineMs,
}, '[Routing] Deadline override');
provider = fastest.provider;
selectedModel = fastest.model;
method = method + '+deadline';
}
} catch (err) {
logger.debug({ err: err.message }, '[Routing] Deadline check failed, ignoring');
}
}
// Phase 6.1 — per-tenant policy overrides.
// tenantPolicy comes from options (threaded from Express res.locals via
// orchestrator → databricks → here).
if (options.tenantPolicy) {
try {
const overridden = applyTenantOverrides(
{ provider, model: selectedModel, tier, method },
options.tenantPolicy,
);
if (overridden && overridden.model !== selectedModel) {
logger.debug({
from: `${provider}:${selectedModel}`,
to: `${overridden.provider}:${overridden.model}`,
}, '[Routing] Tenant override');
provider = overridden.provider;
selectedModel = overridden.model;
method = overridden.method;
}
} catch (err) {
logger.debug({ err: err.message }, '[Routing] Tenant override failed, ignoring');
}
}
const decision = {
provider,
model: selectedModel,
tier,
method,
reason: analysis.recommendation,
score: analysis.score,
threshold: analysis.threshold,
mode: analysis.mode,
analysis,
embeddingsResult,
agenticResult,
costOptimized,
risk,
knnResult,
};
// Phase 4.4 — shadow-mode policy comparison (fire-and-forget).
const shadowFn = getShadowPolicy();
if (shadowFn) {
setImmediate(() =>
shadowCompareAndLog({ payload, activeDecision: decision, shadowFn }).catch(() => {})
);
}
// Phase 3: Record metrics
routingMetrics.record(decision);
logger.debug(
{
provider,
score: analysis.score,
threshold: analysis.threshold,
recommendation: analysis.recommendation,
taskType: analysis.breakdown?.taskType?.reason,
toolCount: payload?.tools?.length ?? 0,
},
'Smart routing decision'
);
return decision;
}
/**
* Get routing headers to include in response
* Phase 3: Expose routing decision to clients
*/
function getRoutingHeaders(decision) {
const headers = {
'X-Lynkr-Routing-Method': decision.method || 'unknown',
'X-Lynkr-Provider': decision.provider || 'unknown',
};
if (typeof decision.score === 'number') {
headers['X-Lynkr-Complexity-Score'] = String(decision.score);
}
if (decision.threshold) {
headers['X-Lynkr-Complexity-Threshold'] = String(decision.threshold);
}
if (decision.reason) {
headers['X-Lynkr-Routing-Reason'] = decision.reason;
}
// Tier and model headers
if (decision.tier) {
headers['X-Lynkr-Tier'] = decision.tier;
}
if (decision.model) {
headers['X-Lynkr-Model'] = decision.model;
}
if (decision.agenticResult?.isAgentic) {
headers['X-Lynkr-Agentic'] = decision.agenticResult.agentType;
}
if (decision.costOptimized) {
headers['X-Lynkr-Cost-Optimized'] = 'true';
}
if (decision.risk?.level) {
headers['X-Lynkr-Risk'] = decision.risk.level;
const hits = Array.from(new Set([
...(decision.risk.instructionHits || []),
...(decision.risk.pathHits || []),
]));
if (hits.length > 0) {
// Header values are ASCII-only; comma-join the first few hits.
headers['X-Lynkr-Risk-Hits'] = hits.slice(0, 8).join(',');
}
}
return headers;
}
/**
* Get routing statistics
* Phase 3: Metrics access
*/
function getRoutingStats() {
return routingMetrics.getStats();
}
module.exports = {
// Main routing function
determineProviderSmart,
// Helpers
isFallbackEnabled,
getFallbackProvider,
getBestCloudProvider,
getBestLocalProvider,
isLocalProvider,
// Phase 3: Headers and metrics
getRoutingHeaders,
getRoutingStats,
// Re-export analyzer for direct access
analyzeComplexity: require('./complexity-analyzer').analyzeComplexity,
analyzeRisk,
// Intelligent routing modules
getAgenticDetector,
getModelTierSelector,
getCostOptimizer,
AGENT_TYPES,
TIER_DEFINITIONS,
// Phase 3-6 modules
getKnnRouter,
getBandit,
getShadowPolicy,
shadowCompareAndLog,
chooseFastest,
applyTenantOverrides,
// Telemetry
telemetry,
scoreResponseQuality,
getLatencyTracker,
};