UNPKG

lynkr

Version:

Self-hosted LLM gateway and tier-routing proxy for Claude Code, Cursor, and Codex. Routes across Ollama, AWS Bedrock, OpenRouter, Databricks, Azure OpenAI, llama.cpp, and LM Studio with prompt caching, MCP tools, and 60-80% cost savings.

501 lines (422 loc) 16.1 kB
/** * Model Registry * Multi-source pricing: LiteLLM -> models.dev -> Databricks fallback * Caches data locally with 24h TTL */ const fs = require('fs'); const path = require('path'); const logger = require('../logger'); // API URLs const LITELLM_URL = 'https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json'; const MODELS_DEV_URL = 'https://models.dev/api.json'; // Cache settings const CACHE_FILE = path.join(__dirname, '../../data/model-prices-cache.json'); const CACHE_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours // Databricks fallback pricing (based on Anthropic direct API prices) const DATABRICKS_FALLBACK = { // Claude models 'databricks-claude-opus-4-6': { input: 5.0, output: 25.0, context: 1000000 }, 'databricks-claude-opus-4-5': { input: 5.0, output: 25.0, context: 200000 }, 'databricks-claude-opus-4-1': { input: 15.0, output: 75.0, context: 200000 }, 'databricks-claude-sonnet-4-5': { input: 3.0, output: 15.0, context: 200000 }, 'databricks-claude-sonnet-4': { input: 3.0, output: 15.0, context: 200000 }, 'databricks-claude-3-7-sonnet': { input: 3.0, output: 15.0, context: 200000 }, 'databricks-claude-haiku-4-5': { input: 1.0, output: 5.0, context: 200000 }, // Llama models 'databricks-llama-4-maverick': { input: 1.0, output: 1.0, context: 128000 }, 'databricks-meta-llama-3-3-70b-instruct': { input: 0.9, output: 0.9, context: 128000 }, 'databricks-meta-llama-3-1-405b-instruct': { input: 2.0, output: 2.0, context: 128000 }, 'databricks-meta-llama-3-1-8b-instruct': { input: 0.2, output: 0.2, context: 128000 }, // GPT models via Databricks 'databricks-gpt-5-2': { input: 5.0, output: 15.0, context: 200000 }, 'databricks-gpt-5-1': { input: 3.0, output: 12.0, context: 200000 }, 'databricks-gpt-5': { input: 2.5, output: 10.0, context: 128000 }, 'databricks-gpt-5-mini': { input: 0.5, output: 1.5, context: 128000 }, 'databricks-gpt-5-nano': { input: 0.15, output: 0.6, context: 128000 }, // Gemini models via Databricks 'databricks-gemini-3-flash': { input: 0.075, output: 0.3, context: 1000000 }, 'databricks-gemini-3-pro': { input: 1.25, output: 5.0, context: 2000000 }, 'databricks-gemini-2-5-pro': { input: 1.25, output: 5.0, context: 1000000 }, 'databricks-gemini-2-5-flash': { input: 0.075, output: 0.3, context: 1000000 }, // DBRX 'databricks-dbrx-instruct': { input: 0.75, output: 2.25, context: 32000 }, // Embedding models (price per 1M tokens) 'databricks-gte-large-en': { input: 0.02, output: 0, context: 8192 }, 'databricks-bge-large-en': { input: 0.02, output: 0, context: 512 }, }; // Default cost for unknown models. Returned with `unknown: true` so callers can // distinguish a real price from a fabricated guess. const DEFAULT_COST = { input: 1.0, output: 3.0, context: 128000 }; // Curated name aliases (exact, one-directional). Maps a name a caller might use // to the canonical key likely present in the pricing data. Misses are harmless // (resolution simply continues down the ladder). const MODEL_ALIASES = { 'claude-sonnet-4-5': 'claude-sonnet-4-5-20250929', 'claude-opus-4-1': 'claude-opus-4-1-20250805', 'claude-3-5-sonnet': 'claude-3-5-sonnet-20241022', }; /** * Parse MODEL_PRICE_OVERRIDES env (JSON object of * { "<model>": { "input": <usd/1M>, "output": <usd/1M>, "context"?: N } }). * Lets operators pin correct prices for models the registry doesn't know. */ function _loadOverrides() { const out = new Map(); const raw = process.env.MODEL_PRICE_OVERRIDES; if (!raw) return out; try { const parsed = JSON.parse(raw); for (const [name, info] of Object.entries(parsed)) { if (info && typeof info.input === 'number' && typeof info.output === 'number') { out.set(name.toLowerCase(), { context: 128000, ...info }); } } } catch (err) { logger.warn({ err: err.message }, '[ModelRegistry] Failed to parse MODEL_PRICE_OVERRIDES'); } return out; } class ModelRegistry { constructor() { this.litellmPrices = {}; this.modelsDevPrices = {}; this.loaded = false; this.lastFetch = 0; this.modelIndex = new Map(); this.overrides = _loadOverrides(); } /** * Initialize registry - load from cache or fetch fresh data */ async initialize() { if (this.loaded) return; // Try cache first if (this._loadFromCache()) { this.loaded = true; // Background refresh if stale if (Date.now() - this.lastFetch > CACHE_TTL_MS) { this._fetchAll().catch(err => logger.warn({ err: err.message }, '[ModelRegistry] Background refresh failed') ); } return; } // Fetch fresh data await this._fetchAll(); this.loaded = true; } /** * Fetch from both sources */ async _fetchAll() { const results = await Promise.allSettled([ this._fetchLiteLLM(), this._fetchModelsDev(), ]); const litellmOk = results[0].status === 'fulfilled'; const modelsDevOk = results[1].status === 'fulfilled'; if (litellmOk || modelsDevOk) { this._buildIndex(); this._saveToCache(); this.lastFetch = Date.now(); logger.info({ litellm: litellmOk ? Object.keys(this.litellmPrices).length : 0, modelsDev: modelsDevOk ? Object.keys(this.modelsDevPrices).length : 0, total: this.modelIndex.size, }, '[ModelRegistry] Loaded pricing data'); } else { logger.warn('[ModelRegistry] All sources failed, using Databricks fallback only'); } } /** * Fetch LiteLLM pricing */ async _fetchLiteLLM() { try { const response = await fetch(LITELLM_URL, { signal: AbortSignal.timeout(15000), headers: { 'Accept': 'application/json' }, }); if (!response.ok) throw new Error(`HTTP ${response.status}`); const data = await response.json(); this.litellmPrices = this._processLiteLLM(data); logger.debug({ count: Object.keys(this.litellmPrices).length }, '[ModelRegistry] LiteLLM loaded'); } catch (err) { logger.warn({ err: err.message }, '[ModelRegistry] LiteLLM fetch failed'); throw err; } } /** * Process LiteLLM format into our format * LiteLLM uses cost per token, we use cost per 1M tokens */ _processLiteLLM(data) { const prices = {}; for (const [modelId, info] of Object.entries(data)) { if (!info || typeof info !== 'object') continue; // Convert per-token to per-million-tokens const inputCost = (info.input_cost_per_token || 0) * 1_000_000; const outputCost = (info.output_cost_per_token || 0) * 1_000_000; prices[modelId.toLowerCase()] = { input: inputCost, output: outputCost, context: info.max_input_tokens || info.max_tokens || 128000, maxOutput: info.max_output_tokens || 4096, toolCall: info.supports_function_calling ?? true, vision: info.supports_vision ?? false, source: 'litellm', }; // Also index without provider prefix for flexible lookup const shortName = modelId.split('/').pop().toLowerCase(); if (shortName !== modelId.toLowerCase()) { prices[shortName] = prices[modelId.toLowerCase()]; } } return prices; } /** * Fetch models.dev pricing */ async _fetchModelsDev() { try { const response = await fetch(MODELS_DEV_URL, { signal: AbortSignal.timeout(15000), headers: { 'Accept': 'application/json' }, }); if (!response.ok) throw new Error(`HTTP ${response.status}`); const data = await response.json(); this.modelsDevPrices = this._processModelsDev(data); logger.debug({ count: Object.keys(this.modelsDevPrices).length }, '[ModelRegistry] models.dev loaded'); } catch (err) { logger.warn({ err: err.message }, '[ModelRegistry] models.dev fetch failed'); throw err; } } /** * Process models.dev format into our format */ _processModelsDev(data) { const prices = {}; for (const [providerId, providerData] of Object.entries(data)) { if (!providerData?.models) continue; for (const [modelId, info] of Object.entries(providerData.models)) { const fullId = `${providerId}/${modelId}`.toLowerCase(); prices[fullId] = { input: info.cost?.input || 0, output: info.cost?.output || 0, cacheRead: info.cost?.cache_read, cacheWrite: info.cost?.cache_write, context: info.context || 128000, maxOutput: info.output || 4096, toolCall: info.tool_call ?? false, reasoning: info.reasoning ?? false, vision: Array.isArray(info.input) && info.input.includes('image'), source: 'models.dev', }; // Also index by short name prices[modelId.toLowerCase()] = prices[fullId]; } } return prices; } /** * Build unified index from all sources */ _buildIndex() { this.modelIndex.clear(); // Add Databricks fallback first (lowest priority) for (const [modelId, info] of Object.entries(DATABRICKS_FALLBACK)) { this.modelIndex.set(modelId.toLowerCase(), { ...info, source: 'databricks-fallback' }); } // Add models.dev (medium priority) for (const [modelId, info] of Object.entries(this.modelsDevPrices)) { this.modelIndex.set(modelId, info); } // Add LiteLLM (highest priority) for (const [modelId, info] of Object.entries(this.litellmPrices)) { this.modelIndex.set(modelId, info); } } /** * Get cost for a model * @param {string} modelName - Model name/ID * @returns {Object} Cost info { input, output, context, ... } */ getCost(modelName) { if (!modelName) return { ...DEFAULT_COST, source: 'default', unknown: true }; const name = String(modelName).toLowerCase().trim(); const hit = this._resolveCost(name); if (hit) return hit; // Nothing matched — report unknown rather than silently fabricating a price. logger.debug({ model: modelName }, '[ModelRegistry] Model not found — cost unknown'); return { ...DEFAULT_COST, source: 'default', unknown: true }; } /** * Deterministic price resolution. Each step is exact (no bidirectional * substring matching), and the only loose step (longest-prefix) is * one-directional and length-bounded, so unrelated names can't false-match. * Returns a cost object with a `resolution` tag, or null if nothing matched. * @param {string} name - already lowercased/trimmed */ _resolveCost(name) { const tag = (value, resolution, matchedAs) => ({ ...value, resolution, ...(matchedAs && matchedAs !== name ? { matchedAs } : {}), }); // 1. Operator overrides (exact) — ground truth. if (this.overrides.has(name)) return tag({ ...this.overrides.get(name), source: 'override' }, 'override'); // 2. Exact registry hit. if (this.modelIndex.has(name)) return tag(this.modelIndex.get(name), 'exact'); // 3. Provider-prefix strip (exact). const stripped = [ name.replace(/^databricks-/, ''), name.replace(/^azure\//, ''), name.replace(/^bedrock\//, ''), name.replace(/^anthropic\./, ''), name.replace(/^openai\//, ''), name.includes('/') ? name.split('/').pop() : null, ].filter((v) => v && v !== name); for (const v of stripped) { if (this.overrides.has(v)) return tag({ ...this.overrides.get(v), source: 'override' }, 'prefix-strip', v); if (this.modelIndex.has(v)) return tag(this.modelIndex.get(v), 'prefix-strip', v); } // 4. Curated alias (exact). const alias = MODEL_ALIASES[name]; if (alias && this.modelIndex.has(alias)) return tag(this.modelIndex.get(alias), 'alias', alias); // 5. Date/version-suffix normalization (e.g. -20250929, -2025-09-29, -v2). const dateless = name.replace(/[-@](\d{8}|\d{4}-\d{2}-\d{2}|v\d+)$/, ''); if (dateless !== name && this.modelIndex.has(dateless)) return tag(this.modelIndex.get(dateless), 'date-normalize', dateless); // 6. Longest registry key that is a prefix of the requested name. Bounded so // short keys can't grab unrelated names (e.g. "gpt-5.2-chat-2026" → "gpt-5.2-chat"). let best = null; for (const [key, value] of this.modelIndex.entries()) { if (key.length >= 6 && name.startsWith(key) && (!best || key.length > best.key.length)) { best = { key, value }; } } if (best) return tag(best.value, 'longest-prefix', best.key); return null; } /** * Get model info by name */ getModel(modelName) { return this.getCost(modelName); } /** * Check if model is free (local) */ isFree(modelName) { const cost = this.getCost(modelName); return cost.input === 0 && cost.output === 0; } /** * Check if model supports tool calling */ supportsTools(modelName) { const model = this.getCost(modelName); return model.toolCall === true; } /** * Find models matching criteria */ findModels(criteria = {}) { const results = []; for (const [modelId, info] of this.modelIndex.entries()) { if (criteria.maxInputCost && info.input > criteria.maxInputCost) continue; if (criteria.minContext && info.context < criteria.minContext) continue; if (criteria.toolCall && !info.toolCall) continue; if (criteria.reasoning && !info.reasoning) continue; if (criteria.vision && !info.vision) continue; results.push({ modelId, ...info }); } // Sort by input cost ascending return results.sort((a, b) => a.input - b.input); } /** * Get stats for metrics endpoint */ getStats() { const sources = { litellm: 0, 'models.dev': 0, 'databricks-fallback': 0, default: 0 }; for (const info of this.modelIndex.values()) { const source = info.source || 'default'; sources[source] = (sources[source] || 0) + 1; } return { totalModels: this.modelIndex.size, bySource: sources, lastFetch: this.lastFetch, cacheAge: this.lastFetch ? Date.now() - this.lastFetch : null, cacheTTL: CACHE_TTL_MS, }; } /** * Force refresh from APIs */ async refresh() { await this._fetchAll(); } // Cache management _loadFromCache() { try { if (!fs.existsSync(CACHE_FILE)) return false; const cache = JSON.parse(fs.readFileSync(CACHE_FILE, 'utf8')); this.litellmPrices = cache.litellm || {}; this.modelsDevPrices = cache.modelsDev || {}; this.lastFetch = cache.timestamp || 0; this._buildIndex(); logger.debug({ age: Math.round((Date.now() - this.lastFetch) / 60000) + 'min', models: this.modelIndex.size, }, '[ModelRegistry] Loaded from cache'); return true; } catch (err) { logger.debug({ err: err.message }, '[ModelRegistry] Cache load failed'); return false; } } _saveToCache() { try { const dir = path.dirname(CACHE_FILE); if (!fs.existsSync(dir)) { fs.mkdirSync(dir, { recursive: true }); } const cache = { litellm: this.litellmPrices, modelsDev: this.modelsDevPrices, timestamp: Date.now(), }; fs.writeFileSync(CACHE_FILE, JSON.stringify(cache, null, 2)); logger.debug('[ModelRegistry] Cache saved'); } catch (err) { logger.warn({ err: err.message }, '[ModelRegistry] Cache save failed'); } } } // Singleton with lazy initialization let instance = null; async function getModelRegistry() { if (!instance) { instance = new ModelRegistry(); await instance.initialize(); } return instance; } // Sync getter (uses cache only, no network) function getModelRegistrySync() { if (!instance) { instance = new ModelRegistry(); instance._loadFromCache(); instance._buildIndex(); instance.loaded = true; } return instance; } module.exports = { ModelRegistry, getModelRegistry, getModelRegistrySync, DATABRICKS_FALLBACK, DEFAULT_COST, };