UNPKG

lynkr

Version:

Self-hosted LLM gateway and tier-routing proxy for Claude Code, Cursor, and Codex. Routes across Ollama, AWS Bedrock, OpenRouter, Databricks, Azure OpenAI, llama.cpp, and LM Studio with prompt caching, MCP tools, and 60-80% cost savings.

309 lines (263 loc) 8.47 kB
/** * Cost Optimizer Module * Tracks and optimizes LLM costs across providers * Uses ModelRegistry for dynamic pricing data */ const logger = require('../logger'); const config = require('../config'); const { getModelRegistry, getModelRegistrySync } = require('./model-registry'); const { getModelTierSelector, TIER_DEFINITIONS } = require('./model-tiers'); const { ratioFor } = require('./output-ratios'); // Session cost tracking (in-memory) const sessionCosts = new Map(); // sessionId -> { total, requests, byModel, byProvider } // Global stats const globalStats = { totalCost: 0, totalSavings: 0, requestCount: 0, byProvider: {}, byTier: {}, }; class CostOptimizer { constructor() { this.registry = null; this.tierSelector = null; } /** * Initialize with registry (async) */ async initialize() { this.registry = await getModelRegistry(); this.tierSelector = getModelTierSelector(); } /** * Get registry (sync fallback) */ _getRegistry() { if (!this.registry) { this.registry = getModelRegistrySync(); } return this.registry; } /** * Get tier selector */ _getTierSelector() { if (!this.tierSelector) { this.tierSelector = getModelTierSelector(); } return this.tierSelector; } /** * Estimate cost for a request before sending * @param {string} model - Model name * @param {number} inputTokens - Estimated input tokens * @param {number} outputTokens - Estimated output tokens (optional) * @returns {Object} Cost estimate */ estimateCost(model, inputTokens, outputTokens = null, taskType = null) { const registry = this._getRegistry(); const costs = registry.getCost(model); const inputCost = (inputTokens / 1_000_000) * costs.input; // Phase 2.3: per-task-type output ratio learned from telemetry const ratio = taskType ? ratioFor(taskType) : 0.5; const estimatedOutputTokens = outputTokens || Math.min(inputTokens * ratio, costs.maxOutput || 4096); const outputCost = (estimatedOutputTokens / 1_000_000) * costs.output; return { inputCost: Math.round(inputCost * 1_000_000) / 1_000_000, outputCost: Math.round(outputCost * 1_000_000) / 1_000_000, totalEstimate: Math.round((inputCost + outputCost) * 1_000_000) / 1_000_000, model, inputTokens, outputTokens: estimatedOutputTokens, pricePerMillion: { input: costs.input, output: costs.output, }, source: costs.source, }; } /** * Find cheapest model capable of handling a complexity tier * @param {string} requiredTier - Minimum tier required * @param {string[]} availableProviders - Providers to consider * @returns {Object|null} Cheapest model info */ findCheapestForTier(requiredTier, availableProviders) { const registry = this._getRegistry(); const tierSelector = this._getTierSelector(); const tierOrder = ['SIMPLE', 'MEDIUM', 'COMPLEX', 'REASONING']; const minTierIndex = tierOrder.indexOf(requiredTier); if (minTierIndex === -1) { logger.warn({ tier: requiredTier }, '[CostOptimizer] Unknown tier'); return null; } const candidates = []; // Collect models from all capable tiers (>= required tier) for (let i = minTierIndex; i < tierOrder.length; i++) { const tier = tierOrder[i]; for (const provider of availableProviders) { const models = tierSelector.getPreferredModels(tier, provider); for (const model of models) { const cost = registry.getCost(model); const totalCost = cost.input + cost.output; // Simple cost metric candidates.push({ model, provider, tier, inputCost: cost.input, outputCost: cost.output, totalCost, context: cost.context, source: cost.source, }); } } } if (candidates.length === 0) { return null; } // Sort by total cost (input + output per 1M tokens) candidates.sort((a, b) => a.totalCost - b.totalCost); const cheapest = candidates[0]; logger.debug({ requiredTier, selectedModel: cheapest.model, selectedProvider: cheapest.provider, cost: cheapest.totalCost, candidateCount: candidates.length, }, '[CostOptimizer] Found cheapest model'); return cheapest; } /** * Record actual cost after response * @param {string} sessionId - Session identifier * @param {string} provider - Provider used * @param {string} model - Model used * @param {number} inputTokens - Actual input tokens * @param {number} outputTokens - Actual output tokens * @param {string} tier - Complexity tier * @returns {number} Actual cost */ recordCost(sessionId, provider, model, inputTokens, outputTokens, tier = 'MEDIUM') { const registry = this._getRegistry(); const costs = registry.getCost(model); const inputCost = (inputTokens / 1_000_000) * costs.input; const outputCost = (outputTokens / 1_000_000) * costs.output; const actualCost = inputCost + outputCost; // Update session costs if (sessionId) { if (!sessionCosts.has(sessionId)) { sessionCosts.set(sessionId, { total: 0, requests: 0, byModel: {}, byProvider: {}, byTier: {}, }); } const session = sessionCosts.get(sessionId); session.total += actualCost; session.requests++; session.byModel[model] = (session.byModel[model] || 0) + actualCost; session.byProvider[provider] = (session.byProvider[provider] || 0) + actualCost; session.byTier[tier] = (session.byTier[tier] || 0) + actualCost; } // Update global stats globalStats.totalCost += actualCost; globalStats.requestCount++; globalStats.byProvider[provider] = (globalStats.byProvider[provider] || 0) + actualCost; globalStats.byTier[tier] = (globalStats.byTier[tier] || 0) + actualCost; logger.debug({ sessionId, provider, model, inputTokens, outputTokens, cost: actualCost.toFixed(6), tier, }, '[CostOptimizer] Recorded cost'); return actualCost; } /** * Calculate potential savings from routing optimization */ calculateSavings(originalModel, optimizedModel, tokens) { const registry = this._getRegistry(); const originalCost = registry.getCost(originalModel); const optimizedCost = registry.getCost(optimizedModel); const originalTotal = (tokens / 1_000_000) * (originalCost.input + originalCost.output); const optimizedTotal = (tokens / 1_000_000) * (optimizedCost.input + optimizedCost.output); const savings = originalTotal - optimizedTotal; if (savings > 0) { globalStats.totalSavings += savings; } return { originalCost: originalTotal, optimizedCost: optimizedTotal, savings: Math.max(0, savings), percentSaved: originalTotal > 0 ? (savings / originalTotal) * 100 : 0, }; } /** * Get session cost summary */ getSessionCost(sessionId) { return sessionCosts.get(sessionId) || { total: 0, requests: 0, byModel: {}, byProvider: {}, byTier: {}, }; } /** * Get global stats */ getStats() { return { ...globalStats, sessionCount: sessionCosts.size, avgCostPerRequest: globalStats.requestCount > 0 ? (globalStats.totalCost / globalStats.requestCount).toFixed(6) : '0', totalCostFormatted: `$${globalStats.totalCost.toFixed(4)}`, totalSavingsFormatted: `$${globalStats.totalSavings.toFixed(4)}`, }; } /** * Clear session data (for cleanup) */ clearSession(sessionId) { sessionCosts.delete(sessionId); } /** * Reset all stats (for testing) */ resetStats() { sessionCosts.clear(); globalStats.totalCost = 0; globalStats.totalSavings = 0; globalStats.requestCount = 0; globalStats.byProvider = {}; globalStats.byTier = {}; } } // Singleton instance let instance = null; function getCostOptimizer() { if (!instance) { instance = new CostOptimizer(); } return instance; } async function getCostOptimizerAsync() { const optimizer = getCostOptimizer(); await optimizer.initialize(); return optimizer; } module.exports = { CostOptimizer, getCostOptimizer, getCostOptimizerAsync, };