UNPKG

lynkr

Version:

Self-hosted LLM gateway and tier-routing proxy for Claude Code, Cursor, and Codex. Routes across Ollama, AWS Bedrock, OpenRouter, Databricks, Azure OpenAI, llama.cpp, and LM Studio with prompt caching, MCP tools, and 60-80% cost savings.

450 lines (399 loc) 24 kB
/**
 * Full-Stack Benchmark: Lynkr vs LiteLLM vs Portkey
 *
 * Tests 6 scenarios that cover Lynkr's full optimization stack:
 *   1. Simple Q&A         → tier routing only
 *   2. Tool-heavy request → smart tool selection (50-70% token reduction)
 *   3. Long history       → history compression
 *   4. Large payload      → TOON compression
 *   5. Repeated prompts   → semantic cache (2nd call should be ~0 tokens billed)
 *   6. Reasoning request  → tier routing to top model
 *
 * LiteLLM and Portkey send tokens as-is. Lynkr compresses before the model sees them.
 * The delta in input_tokens IS the compression saving.
 *
 * Usage:
 *   ANTHROPIC_API_KEY=sk-ant-... \
 *   LITELLM_MASTER_KEY=sk-1234 \
 *   PORTKEY_API_KEY=your-key \
 *   node benchmark-tier-routing.js
 *
 * To execute the script directly, `#!/usr/bin/env node` must be the very first
 * line of the file; a hashbang anywhere else is a syntax error.
 */

// ─── Proxy config ─────────────────────────────────────────────────────────────

/** Response header from which the LiteLLM adapter infers the routed backend. */
const LITELLM_COST_HEADER = 'x-litellm-response-cost-original';

/** Tier label reported for each LiteLLM cost band. */
const LITELLM_TIER_BY_BAND = {
  free: 'SIMPLE/MEDIUM (Ollama)',
  cheap: 'MEDIUM (Moonshot)',
  premium: 'COMPLEX/REASONING (Azure)',
  unknown: 'unknown',
};

/** Model label reported for each LiteLLM cost band. */
const LITELLM_MODEL_BY_BAND = {
  free: 'ollama (local/free)',
  cheap: 'moonshot/kimi-k2.6',
  premium: 'azure/gpt-5.2-chat',
  unknown: 'unknown',
};

/**
 * Buckets the LiteLLM cost header into a band.
 *
 * A missing header is treated as a cost of 0. A header that cannot be parsed
 * as a number yields 'unknown' instead of being reported as the premium band.
 *
 * @param {Record<string, string>} headers - Response headers keyed by lower-case name.
 * @returns {'free' | 'cheap' | 'premium' | 'unknown'} The cost band.
 */
function liteLlmCostBand(headers) {
  const cost = Number.parseFloat(headers[LITELLM_COST_HEADER] ?? '0');
  if (Number.isNaN(cost)) return 'unknown';
  if (cost === 0) return 'free';
  return cost < 0.01 ? 'cheap' : 'premium';
}

/**
 * Gateways under test. Each entry carries the base URL, the API key sent as
 * `x-api-key`, the model name placed in the request body, extra request
 * headers, and two adapters that derive the tier / model labels from the
 * parsed response body and the response headers.
 */
const PROXIES = [
  {
    name: 'Lynkr',
    url: process.env.LYNKR_URL ?? 'http://localhost:8081',
    apiKey: process.env.ANTHROPIC_API_KEY,
    defaultModel: 'claude-sonnet-4-5',
    headers: {},
    getTier: (_b, h) => h['x-lynkr-tier'] ?? 'unknown',
    getModel: (_b, h) => h['x-lynkr-model'] ?? h['x-lynkr-provider'] ?? 'unknown',
  },
  {
    name: 'LiteLLM',
    url: process.env.LITELLM_URL ?? 'http://localhost:8082',
    apiKey: process.env.LITELLM_MASTER_KEY ?? 'sk-1234',
    defaultModel: 'smart-router',
    headers: {},
    getTier: (_b, h) => LITELLM_TIER_BY_BAND[liteLlmCostBand(h)],
    getModel: (_b, h) => LITELLM_MODEL_BY_BAND[liteLlmCostBand(h)],
  },
  {
    name: 'Portkey',
    url: process.env.PORTKEY_URL ?? 'http://localhost:8083',
    apiKey: process.env.ANTHROPIC_API_KEY,
    defaultModel: 'claude-sonnet-4-5',
    headers: {
      'x-portkey-provider': 'anthropic',
      // The Portkey key header is only sent when a key is configured.
      ...(process.env.PORTKEY_API_KEY
        ? { 'x-portkey-api-key': process.env.PORTKEY_API_KEY }
        : {}),
    },
    getTier: () => 'N/A',
    getModel: (b) => b?.model ?? 'claude-sonnet-4-5',
  },
];

// ─── Pricing per 1M tokens [input, output] USD ───────────────────────────────
// Keys are matched as substrings of the lower-cased model name and the first
// match wins, so a more specific key ('gpt-4o-mini') must precede any key it
// contains ('gpt-4o').
const PRICING = {
  'claude-haiku-4-5': [0.80, 4.00],
  'claude-haiku-3': [0.25, 1.25],
  'claude-sonnet-4-5': [3.00, 15.00],
  'claude-sonnet-3-5': [3.00, 15.00],
  'claude-opus-4': [15.00, 75.00],
  'gpt-4o-mini': [0.15, 0.60],
  'gpt-4o': [2.50, 10.00],
  'o3-mini': [1.10, 4.40],
  // The LiteLLM adapter labels zero-cost responses 'ollama (local/free)';
  // without this entry they would be billed at the 'default' rate.
  'ollama': [0, 0],
  'default': [3.00, 15.00],
};

/**
 * Prices a request in USD using the PRICING table.
 *
 * @param {string} model - Model label; anything without a matching PRICING key
 *   (including a missing or non-string value) is priced at the 'default' rate.
 * @param {number} inputTok - Billed input tokens.
 * @param {number} outputTok - Billed output tokens.
 * @returns {number} Cost in USD.
 */
function costUsd(model, inputTok, outputTok) {
  const normalized = String(model ?? '').toLowerCase();
  const key = Object.keys(PRICING).find((k) => normalized.includes(k)) ?? 'default';
  const [inputRate, outputRate] = PRICING[key];
  return (inputTok / 1e6) * inputRate + (outputTok / 1e6) * outputRate;
}

/**
 * Rough token estimator: 1 token ≈ 4 chars of the JSON-serialised payload
 * (GPT/Claude rule of thumb).
 *
 * @param {*} payload - Any JSON-serialisable value.
 * @returns {number} Estimated token count; 0 when the value has no JSON form
 *   (e.g. `undefined`), for which JSON.stringify returns `undefined`.
 */
function estimateTokens(payload) {
  const json = JSON.stringify(payload);
  return json === undefined ? 0 : Math.ceil(json.length / 4);
}

// ─── Reusable tool definitions (simulate a real Claude Code session) ──────────
// 14 tool schemas; scenarios that pass the full list send all of them with the request.
const TOOL_DEFINITIONS = [
  {
    name: 'Read',
    description: 'Read a file from disk',
    input_schema: {
      type: 'object',
      properties: { file_path: { type: 'string' }, limit: { type: 'number' } },
      required: ['file_path'],
    },
  },
  {
    name: 'Write',
    description: 'Write content to a file',
    input_schema: {
      type: 'object',
      properties: { file_path: { type: 'string' }, content: { type: 'string' } },
      required: ['file_path', 'content'],
    },
  },
  {
    name: 'Edit',
    description: 'Make targeted edits to a file',
    input_schema: {
      type: 'object',
      properties: {
        file_path: { type: 'string' },
        old_string: { type: 'string' },
        new_string: { type: 'string' },
      },
      required: ['file_path', 'old_string', 'new_string'],
    },
  },
  {
    name: 'Bash',
    description: 'Execute a shell command',
    input_schema: {
      type: 'object',
      properties: { command: { type: 'string' }, timeout: { type: 'number' } },
      required: ['command'],
    },
  },
  {
    name: 'Glob',
    description: 'Find files matching a pattern',
    input_schema: {
      type: 'object',
      properties: { pattern: { type: 'string' }, path: { type: 'string' } },
      required: ['pattern'],
    },
  },
  {
    name: 'Grep',
    description: 'Search for patterns in files',
    input_schema: {
      type: 'object',
      properties: {
        pattern: { type: 'string' },
        path: { type: 'string' },
        glob: { type: 'string' },
      },
      required: ['pattern'],
    },
  },
  {
    name: 'WebSearch',
    description: 'Search the web',
    input_schema: {
      type: 'object',
      properties: { query: { type: 'string' } },
      required: ['query'],
    },
  },
  {
    name: 'WebFetch',
    description: 'Fetch a URL',
    input_schema: {
      type: 'object',
      properties: { url: { type: 'string' }, prompt: { type: 'string' } },
      required: ['url'],
    },
  },
  {
    name: 'TodoWrite',
    description: 'Write a todo list',
    input_schema: {
      type: 'object',
      properties: { todos: { type: 'array', items: { type: 'object' } } },
      required: ['todos'],
    },
  },
  {
    name: 'TodoRead',
    description: 'Read the current todo list',
    input_schema: { type: 'object', properties: {} },
  },
  {
    name: 'Task',
    description: 'Spawn a subagent',
    input_schema: {
      type: 'object',
      properties: { description: { type: 'string' }, prompt: { type: 'string' } },
      required: ['description', 'prompt'],
    },
  },
  {
    name: 'NotebookRead',
    description: 'Read a Jupyter notebook',
    input_schema: {
      type: 'object',
      properties: { notebook_path: { type: 'string' } },
      required: ['notebook_path'],
    },
  },
  {
    name: 'NotebookEdit',
    description: 'Edit a Jupyter notebook',
    input_schema: {
      type: 'object',
      properties: {
        notebook_path: { type: 'string' },
        cell_index: { type: 'number' },
        new_source: { type: 'string' },
      },
      required: ['notebook_path', 'cell_index', 'new_source'],
    },
  },
  {
    name: 'mcp__github__create_pull_request',
    description: 'Create a GitHub pull request via MCP',
    input_schema: {
      type: 'object',
      properties: {
        title: { type: 'string' },
        body: { type: 'string' },
        base: { type: 'string' },
        head: { type: 'string' },
      },
      required: ['title', 'body'],
    },
  },
];

// ─── Scenarios ────────────────────────────────────────────────────────────────
/**
 * Benchmark scenarios. Each entry has an `id`, a human-readable `label`, the
 * `feature` it is meant to exercise, and `buildPayload(model)`, which returns
 * the request body posted to `/v1/messages`.
 */
const SCENARIOS = [
  // ── 1. Simple Q&A ─────────────────────────────────────────────────────────
  {
    id: 'S1',
    label: 'Simple Q&A',
    feature: 'Tier routing → cheap model',
    buildPayload: (model) => ({
      model,
      max_tokens: 256,
      messages: [{ role: 'user', content: 'What does git stash do?' }],
    }),
  },

  // ── 2. Tool-heavy (smart tool selection) ──────────────────────────────────
  // All 14 tools sent — Lynkr strips irrelevant ones before forwarding
  {
    id: 'T1',
    label: 'Tool-heavy (14 tools)',
    feature: 'Smart tool selection → strips unused tools',
    buildPayload: (model) => ({
      model,
      max_tokens: 512,
      tools: TOOL_DEFINITIONS,
      messages: [{ role: 'user', content: 'What does the README say about installation?' }],
    }),
  },
  {
    id: 'T2',
    label: 'Tool-heavy (14 tools) – write task',
    feature: 'Smart tool selection → keeps only write tools',
    buildPayload: (model) => ({
      model,
      max_tokens: 512,
      tools: TOOL_DEFINITIONS,
      messages: [{ role: 'user', content: 'Edit the config file to set DEBUG=true' }],
    }),
  },

  // ── 3. Long history (history compression) ─────────────────────────────────
  // Multi-turn conversation — Lynkr compresses older turns before forwarding
  {
    id: 'H1',
    label: 'Long history (8 turns)',
    feature: 'History compression → dedups older turns',
    buildPayload: (model) => ({
      model,
      max_tokens: 512,
      messages: [
        { role: 'user', content: 'Can you help me refactor my Express app?' },
        { role: 'assistant', content: 'Sure! Let\'s start by reviewing your current structure. What does your folder layout look like?' },
        { role: 'user', content: 'I have routes/, controllers/, models/, middleware/ folders.' },
        { role: 'assistant', content: 'Good structure. Are you using any ORM, and do you have error handling middleware in place?' },
        { role: 'user', content: 'I use Sequelize. Error handling is scattered across controllers right now.' },
        { role: 'assistant', content: 'Let\'s centralise error handling first. Create middleware/errorHandler.js and export an express error middleware with four params (err, req, res, next).' },
        { role: 'user', content: 'Done. Now I need to add input validation — should I use Joi or express-validator?' },
        { role: 'assistant', content: 'For Sequelize projects, Joi pairs well. Install it and create a validate() middleware wrapper.' },
        { role: 'user', content: 'Great, now how do I add rate limiting to specific routes only?' },
      ],
    }),
  },

  // ── 4a. TOON – large JSON tool result (file read) ─────────────────────────
  // Simulates a tool_result block returning a large JSON config file.
  // TOON specifically compresses JSON structures — this is its primary trigger.
  //
  // The tool round-trip uses Anthropic Messages content blocks: the assistant
  // turn carries a `tool_use` block and the next user turn answers it with a
  // `tool_result` whose `tool_use_id` matches, followed by the follow-up text.
  // (The request goes to /v1/messages, so OpenAI-style `tool_calls` with
  // `content: null` is not a valid assistant turn there.)
  {
    id: 'L1',
    label: 'TOON – large JSON tool result',
    feature: 'TOON compression → compresses JSON tool_result before forwarding',
    buildPayload: (model) => ({
      model,
      max_tokens: 512,
      tools: [TOOL_DEFINITIONS[0]], // Read tool only
      messages: [
        { role: 'user', content: 'Read package.json and tell me the dependencies.' },
        {
          role: 'assistant',
          content: [
            { type: 'tool_use', id: 'tr_001', name: 'Read', input: { file_path: 'package.json' } },
          ],
        },
        {
          role: 'user',
          content: [
            { type: 'tool_result', tool_use_id: 'tr_001', content: JSON.stringify(generateFakeLargeJsonResult()) },
            { type: 'text', text: 'What are the top-level dependencies?' },
          ],
        },
      ],
    }),
  },

  // ── 4b. TOON – large grep/glob JSON result ────────────────────────────────
  // Simulates a Bash tool returning a large JSON array of search results.
  // Same Anthropic tool_use / tool_result shape as scenario L1.
  {
    id: 'L2',
    label: 'TOON – large JSON grep result (~2k tokens)',
    feature: 'TOON compression → compresses JSON array tool_result',
    buildPayload: (model) => ({
      model,
      max_tokens: 512,
      tools: [TOOL_DEFINITIONS[3]], // Bash tool only
      messages: [
        { role: 'user', content: 'Find all TODO comments in the codebase.' },
        {
          role: 'assistant',
          content: [
            { type: 'tool_use', id: 'tr_002', name: 'Bash', input: { command: 'grep -rn "TODO" src/' } },
          ],
        },
        {
          role: 'user',
          content: [
            { type: 'tool_result', tool_use_id: 'tr_002', content: JSON.stringify(generateFakeGrepResult()) },
            { type: 'text', text: 'Summarise the most important TODOs.' },
          ],
        },
      ],
    }),
  },

  // ── 5. Semantic cache (send same prompt twice) ─────────────────────────────
  // First call: billed normally. Second call: Lynkr returns cached response (0 LLM tokens).
  {
    id: 'SC1',
    label: 'Cache – first call',
    feature: 'Semantic cache – populates cache',
    buildPayload: (model) => ({
      model,
      max_tokens: 256,
      messages: [{ role: 'user', content: 'Explain the difference between TCP and UDP in two sentences.' }],
    }),
  },
  {
    id: 'SC2',
    label: 'Cache – second call (near-identical)',
    feature: 'Semantic cache – should hit cache → 0 tokens billed',
    buildPayload: (model) => ({
      model,
      max_tokens: 256,
      // Slightly paraphrased — semantic cache threshold 0.95 should still match
      messages: [{ role: 'user', content: 'What is the difference between TCP and UDP? Keep it brief.' }],
    }),
  },

  // ── 6. Reasoning ──────────────────────────────────────────────────────────
  {
    id: 'R1',
    label: 'Reasoning – security analysis',
    feature: 'Tier routing → top model + risk classifier',
    buildPayload: (model) => ({
      model,
      max_tokens: 1024,
      messages: [{ role: 'user', content: 'Analyse the security trade-offs of storing JWT tokens in localStorage vs httpOnly cookies for a banking application. Step by step.' }],
    }),
  },
];

// ─── JSON payload generators (TOON compresses these, plain text it ignores) ──
// Both generators are deterministic: buildPayload runs once per proxy, so any
// randomness here would hand each proxy a different payload and make the token
// comparison (and repeated runs) non-reproducible.

/**
 * Builds a fake package.json-like object with many dependencies.
 *
 * Every third package (index 0, 3, 6, …) goes to devDependencies, the rest to
 * dependencies. Versions are derived from the package index only.
 *
 * @returns {object} The fake manifest.
 */
function generateFakeLargeJsonResult() {
  const packages = [
    'express', 'lodash', 'axios', 'react', 'typescript', 'webpack', 'babel', 'eslint',
    'jest', 'mocha', 'chai', 'sinon', 'supertest', 'dotenv', 'cors', 'helmet', 'morgan',
    'winston', 'pino', 'joi', 'yup', 'zod', 'mongoose', 'sequelize', 'prisma', 'knex',
    'redis', 'ioredis', 'bull', 'agenda', 'node-cron', 'socket.io', 'ws', 'graphql',
    'apollo-server', 'type-graphql', 'class-transformer', 'class-validator', 'reflect-metadata',
  ];

  const deps = {};
  const devDeps = {};
  packages.forEach((pkg, i) => {
    // Patch number stays in 0..19, the range the random version used to cover.
    const version = `^${Math.floor(i / 10) + 1}.${i % 10}.${(i * 7) % 20}`;
    const target = i % 3 === 0 ? devDeps : deps;
    target[pkg] = version;
  });

  return {
    name: 'my-app',
    version: '1.0.0',
    scripts: { start: 'node index.js', test: 'jest', build: 'webpack', lint: 'eslint src/' },
    dependencies: deps,
    devDependencies: devDeps,
    engines: { node: '>=18.0.0' },
    keywords: ['api', 'backend', 'nodejs'],
    files: Array.from({ length: 30 }, (_, i) => `src/module${i}.js`),
    exports: Object.fromEntries(packages.map((pkg) => [`./${pkg}`, `./dist/${pkg}/index.js`])),
  };
}

/**
 * Builds a fake `grep -rn "TODO"` result as an array of 60 match records.
 *
 * Each record has `file`, `line` (1..500), `match` and `context`; all fields
 * are derived from the record index only.
 *
 * @returns {Array<{file: string, line: number, match: string, context: string}>}
 */
function generateFakeGrepResult() {
  const folders = ['routes', 'controllers', 'models', 'middleware', 'utils'];
  const tasks = [
    'fix error handling', 'add validation', 'refactor this', 'add tests',
    'update docs', 'remove hardcoded value', 'add rate limiting', 'handle edge case',
  ];
  const contextTasks = ['fix error handling', 'add validation', 'refactor this', 'add tests'];
  const owners = ['alice', 'bob', 'carol', 'dave'];

  return Array.from({ length: 60 }, (_, i) => ({
    file: `src/${folders[i % 5]}/module${i % 15}.js`,
    line: ((i * 37) % 500) + 1,
    match: `TODO: ${tasks[i % 8]} — assigned to ${owners[i % 4]}`,
    context: ` // TODO: ${contextTasks[i % 4]}\n function handler${i}(req, res) { return res.json({ status: 'ok' }); }`,
  }));
}

// ─── HTTP request
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Sends one scenario through one proxy and measures the outcome.
 *
 * Never throws: HTTP errors, network failures, timeouts and unparseable
 * bodies are all reported as `{ ok: false, error, ... }`.
 *
 * @param {object} proxy - An entry of PROXIES.
 * @param {object} scenario - An entry of SCENARIOS.
 * @returns {Promise<object>} On success `{ ok: true, tier, model, billedInput,
 *   billedOutput, estimatedInputTokens, tokensSaved, compressionPct, cost,
 *   latencyMs }`; on failure `{ ok: false, error, latencyMs, estimatedInputTokens }`.
 */
async function sendRequest(proxy, scenario) {
  const payload = scenario.buildPayload(proxy.defaultModel);
  // Count the tool schema only when the scenario sends one; an empty array
  // still serialises to "[]" and would add a phantom token to the estimate.
  const estimatedInputTokens =
    estimateTokens(payload.messages) + (payload.tools ? estimateTokens(payload.tools) : 0);
  const start = Date.now();

  try {
    const res = await fetch(`${proxy.url}/v1/messages`, {
      method: 'POST',
      headers: {
        'content-type': 'application/json',
        // Leave the header out when no key is configured; an undefined value
        // would be coerced to the literal string "undefined".
        ...(proxy.apiKey ? { 'x-api-key': proxy.apiKey } : {}),
        'anthropic-version': '2023-06-01',
        ...proxy.headers,
      },
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(90_000),
    });
    // Latency is taken when the response headers arrive, before the body is read.
    const latencyMs = Date.now() - start;
    const headers = Object.fromEntries(res.headers.entries());

    if (!res.ok) {
      const err = await res.text();
      return { ok: false, error: `HTTP ${res.status}: ${err.slice(0, 100)}`, latencyMs, estimatedInputTokens };
    }

    const body = await res.json();
    const billedInput = body?.usage?.input_tokens ?? 0;
    const billedOutput = body?.usage?.output_tokens ?? 0;
    const model = proxy.getModel(body, headers);
    const tier = proxy.getTier(body, headers);
    const cost = costUsd(model, billedInput, billedOutput);
    // Savings are measured against the local estimate and never go negative.
    const tokensSaved = Math.max(0, estimatedInputTokens - billedInput);
    const compressionPct = estimatedInputTokens > 0
      ? ((tokensSaved / estimatedInputTokens) * 100).toFixed(1)
      : '0.0';

    return { ok: true, tier, model, billedInput, billedOutput, estimatedInputTokens, tokensSaved, compressionPct, cost, latencyMs };
  } catch (e) {
    return { ok: false, error: e.message, latencyMs: Date.now() - start, estimatedInputTokens };
  }
}

// ─── Formatting helpers ───────────────────────────────────────────────────────

/** Renders a value as a fixed-width cell: truncated to `w`, then right-padded. */
const col = (s, w) => String(s ?? '').slice(0, w).padEnd(w);

/** Formats a USD amount with six decimals. */
const $ = (n) => `$${n.toFixed(6)}`;

/** Sums `pick(item)` over `items`. */
function sumBy(items, pick) {
  return items.reduce((acc, item) => acc + pick(item), 0);
}

// ─── Report sections ──────────────────────────────────────────────────────────

/** Prints one table per scenario with a row (or an ERROR line) per proxy. */
function printScenarioDetail(results) {
  console.log('\n\n\n━━━ PER-SCENARIO DETAIL ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');

  for (const scenario of SCENARIOS) {
    console.log(`\n▸ [${scenario.id}] ${scenario.label}`);
    console.log(` Feature under test: ${scenario.feature}`);
    console.log(` ${'Proxy'.padEnd(10)} ${'Tier'.padEnd(14)} ${'Model'.padEnd(26)} ${'Est.Tok'.padEnd(9)} ${'Billed'.padEnd(9)} ${'Saved'.padEnd(8)} ${'Compress%'.padEnd(11)} ${'Cost'.padEnd(12)} Latency`);
    console.log(' ' + '─'.repeat(110));

    for (const proxy of PROXIES) {
      const r = results[proxy.name][scenario.id];
      if (!r.ok) {
        console.log(` ${col(proxy.name,10)} ERROR: ${r.error?.slice(0,80)}`);
        continue;
      }
      console.log([
        ' ',
        col(proxy.name, 10),
        col(r.tier, 14),
        col(r.model, 26),
        col(r.estimatedInputTokens, 9),
        col(r.billedInput, 9),
        col(r.tokensSaved, 8),
        col(r.compressionPct + '%', 11),
        col($(r.cost), 12),
        `${r.latencyMs}ms`,
      ].join(''));
    }
  }
}

/** Prints cost / tokens-saved / compression per proxy for each feature group. */
function printFeatureSummary(results) {
  console.log('\n\n━━━ FEATURE SUMMARY ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');

  const featureGroups = [
    { label: 'Tier Routing (S1, R1)', ids: ['S1', 'R1'] },
    { label: 'Smart Tool Selection (T1, T2)', ids: ['T1', 'T2'] },
    { label: 'History Compression (H1)', ids: ['H1'] },
    { label: 'TOON / JSON Tool Results (L1+L2)', ids: ['L1', 'L2'] },
    { label: 'Semantic Cache (SC1 + SC2)', ids: ['SC1','SC2'] },
  ];

  for (const group of featureGroups) {
    console.log(` ${group.label}`);
    for (const proxy of PROXIES) {
      const rs = group.ids.map((id) => results[proxy.name][id]).filter((r) => r?.ok);
      if (rs.length === 0) {
        console.log(` ${proxy.name.padEnd(10)} – no data`);
        continue;
      }
      const totalCost = sumBy(rs, (r) => r.cost);
      const totalSaved = sumBy(rs, (r) => r.tokensSaved);
      const totalEst = sumBy(rs, (r) => r.estimatedInputTokens);
      const avgCompress = totalEst > 0 ? ((totalSaved / totalEst) * 100).toFixed(1) : '0.0';
      console.log(` ${proxy.name.padEnd(10)} cost: ${$(totalCost).padEnd(14)} tokens saved: ${String(totalSaved).padEnd(8)} compression: ${avgCompress}%`);
    }
    console.log();
  }
}

/**
 * Aggregates each proxy's results for the overall comparison.
 *
 * Costs are compared per successful request (`avgCost`): a failed request has
 * no cost, so comparing raw totals would make a proxy that errors look cheaper.
 * Proxies with no successful request sort last.
 *
 * @returns {Array<object>} One summary per proxy, cheapest average first.
 */
function summariseProxies(results) {
  return PROXIES.map((proxy) => {
    const all = Object.values(results[proxy.name]);
    const ok = all.filter((r) => r?.ok);
    const cost = sumBy(ok, (r) => r.cost);
    return {
      name: proxy.name,
      attempted: all.length,
      succeeded: ok.length,
      cost,
      avgCost: ok.length ? cost / ok.length : 0,
      tokensSaved: sumBy(ok, (r) => r.tokensSaved),
      avgLatency: ok.length ? sumBy(ok, (r) => r.latencyMs) / ok.length : 0,
    };
  }).sort((a, b) => {
    const aEmpty = a.succeeded === 0;
    const bEmpty = b.succeeded === 0;
    if (aEmpty !== bEmpty) return aEmpty ? 1 : -1;
    return a.avgCost - b.avgCost;
  });
}

/**
 * Prints the overall cost ranking and returns the baseline used for it.
 *
 * The baseline is Portkey's average cost per successful request when Portkey
 * has any success, otherwise the first proxy with a non-zero average.
 *
 * @returns {number} Baseline average cost per request in USD.
 */
function printOverallCost(totals) {
  console.log('\n━━━ OVERALL COST (all scenarios) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');

  const maxAvgCost = Math.max(...totals.map((t) => t.avgCost), 0.000001);
  const portkey = totals.find((t) => t.name === 'Portkey' && t.succeeded > 0);
  const baseline = portkey?.avgCost
    ?? totals.find((t) => t.avgCost > 0)?.avgCost
    ?? maxAvgCost;

  for (const t of totals) {
    if (t.succeeded === 0) {
      console.log(` ${t.name.padEnd(10)} – no data (${t.succeeded}/${t.attempted} ok)`);
      continue;
    }
    const pct = baseline > 0 ? ((baseline - t.avgCost) / baseline * 100).toFixed(1) : '0.0';
    const barLen = Math.max(1, Math.round((t.avgCost / maxAvgCost) * 30));
    const bar = '█'.repeat(barLen);
    console.log(` ${t.name.padEnd(10)} ${$(t.cost).padEnd(14)} ${pct.padStart(5)}% cheaper vs baseline avg ${Math.round(t.avgLatency)}ms ${t.succeeded}/${t.attempted} ok ${bar}`);
  }
  return baseline;
}

/** Scales each proxy's average cost per successful request to 100k requests/month. */
function printExtrapolation(totals, baseline) {
  console.log('\n\n━━━ EXTRAPOLATED: 100,000 requests/month ──────────────────────────\n');
  console.log(' (same scenario mix × scale factor)\n');

  const requestsPerMonth = 100_000;
  for (const t of totals) {
    if (t.succeeded === 0) {
      console.log(` ${t.name.padEnd(10)} – no data`);
      continue;
    }
    const monthly = t.avgCost * requestsPerMonth;
    const annualSaving = baseline > 0 ? (baseline - t.avgCost) * requestsPerMonth * 12 : 0;
    console.log(` ${t.name.padEnd(10)} ~$${monthly.toFixed(2).padStart(10)}/month ~$${(annualSaving).toFixed(0).padStart(10)}/year saved vs Portkey`);
  }
}

// ─── Main ─────────────────────────────────────────────────────────────────────

/**
 * Runs every scenario against every proxy sequentially, then prints the
 * per-scenario detail, the feature summary, the overall cost ranking and the
 * monthly extrapolation.
 *
 * @returns {Promise<void>}
 */
async function runBenchmark() {
  console.log('\n╔═══════════════════════════════════════════════════════════════════╗');
  console.log('║ Full-Stack Benchmark: Lynkr vs LiteLLM vs Portkey ║');
  console.log('║ Tests: tier routing · tool selection · history · TOON · cache ║');
  console.log('╚═══════════════════════════════════════════════════════════════════╝\n');

  // results[proxyName][scenarioId] = result
  const results = {};
  for (const p of PROXIES) results[p.name] = {};

  for (const scenario of SCENARIOS) {
    process.stdout.write(`\n[${scenario.id}] ${scenario.label.padEnd(35)} `);
    let allOk = true;
    for (const proxy of PROXIES) {
      process.stdout.write(`${proxy.name}… `);
      const result = await sendRequest(proxy, scenario);
      results[proxy.name][scenario.id] = result;
      if (!result.ok) allOk = false;
      // Short pause between requests.
      await new Promise((r) => setTimeout(r, 400));
    }
    // Only claim success when every proxy answered this scenario.
    process.stdout.write(allOk ? '✓' : '✗');
  }

  printScenarioDetail(results);
  printFeatureSummary(results);
  const totals = summariseProxies(results);
  const baseline = printOverallCost(totals);
  printExtrapolation(totals, baseline);

  console.log('\nDone.\n');
}

runBenchmark().catch((e) => {
  console.error(e);
  process.exit(1);
});