lynkr
Version:
Self-hosted LLM gateway and tier-routing proxy for Claude Code, Cursor, and Codex. Routes across Ollama, AWS Bedrock, OpenRouter, Databricks, Azure OpenAI, llama.cpp, and LM Studio with prompt caching, MCP tools, and 60-80% cost savings.
330 lines (288 loc) • 8.58 kB
JavaScript
/**
* Headroom Sidecar HTTP Client
*
* HTTP client for communicating with the Headroom compression sidecar.
* Provides message compression, CCR retrieval, and metrics collection.
*/
const logger = require("../logger");
const config = require("../config");
// Client-side counters kept in module scope: updated by compressMessages()
// and ccrRetrieve(), reported by getMetrics(), zeroed by resetMetrics().
const metrics = {
// Every compressMessages() call; incremented before the enabled check.
totalCalls: 0,
// Sidecar responses whose `compressed` flag was truthy.
successfulCompressions: 0,
// Calls skipped locally (below minTokens) or answered by the sidecar
// with a falsy `compressed` flag.
skippedCompressions: 0,
// compressMessages() calls that ended in the catch branch.
failures: 0,
// Sum of `stats.tokens_saved` over compressed responses.
totalTokensSaved: 0,
// Sum of `stats.latency_ms` over compressed responses; getMetrics()
// divides this by successfulCompressions.
totalLatencyMs: 0,
// Successful ccrRetrieve() results where `was_search` was falsy.
ccrRetrievals: 0,
// Successful ccrRetrieve() results where `was_search` was truthy.
ccrSearches: 0,
};
/**
 * Return the `headroom` section of the shared application config.
 *
 * @returns {Object|undefined} Whatever `config.headroom` currently holds.
 */
function getConfig() {
  const { headroom } = config;
  return headroom;
}
/**
 * Report whether the Headroom integration is switched on.
 *
 * Only a literal `true` in `config.headroom.enabled` counts; a missing
 * `headroom` section or any other value yields `false`.
 *
 * @returns {boolean}
 */
function isEnabled() {
  const settings = config.headroom;
  if (settings === undefined || settings === null) {
    return false;
  }
  return settings.enabled === true;
}
/**
 * Probe the Headroom sidecar's `/health` endpoint.
 *
 * Never rejects: every failure is reported through the returned object.
 * The request, including reading the response body, is aborted after 2 seconds.
 *
 * @returns {Promise<Object>} One of:
 *   - `{ available: false, reason: "disabled" }` when Headroom is not enabled;
 *   - `{ available: false, reason: "unhealthy", status }` for a non-2xx reply;
 *   - `{ available: false, reason: <error message> }` when the request,
 *     the timeout, or JSON parsing fails;
 *   - `{ available, status, version, ccrEnabled, llmlinguaEnabled,
 *     entriesCached }` mapped from the sidecar's JSON body, where
 *     `available` is true only if `headroom_loaded === true`.
 */
async function checkHealth() {
  const headroomConfig = getConfig();

  if (!isEnabled()) {
    return { available: false, reason: "disabled" };
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 2000);

  try {
    const response = await fetch(`${headroomConfig.endpoint}/health`, {
      signal: controller.signal,
    });

    if (!response.ok) {
      return { available: false, reason: "unhealthy", status: response.status };
    }

    // The body is read while the abort timer is still armed, so a sidecar
    // that stalls mid-response cannot hang the health check.
    const data = await response.json();
    return {
      available: data.headroom_loaded === true,
      status: data.status,
      version: data.headroom_version,
      ccrEnabled: data.ccr_enabled,
      llmlinguaEnabled: data.llmlingua_enabled,
      entriesCached: data.entries_cached,
    };
  } catch (err) {
    return { available: false, reason: err.message };
  } finally {
    // Runs on every exit path so a failed request never leaves the timer pending.
    clearTimeout(timeout);
  }
}
/**
 * Estimate the token count of a value from the length of its JSON
 * serialization (rough approximation: ~4 characters per token).
 *
 * @param {*} messages - Chat messages, or any JSON-serializable value.
 * @returns {number} Serialized length divided by 4, rounded up. Returns 0
 *   when the value has no JSON representation (e.g. `undefined`).
 */
function estimateTokens(messages) {
  // JSON.stringify returns undefined (not a string) for undefined, functions
  // and symbols; treat that as empty text instead of throwing on `.length`.
  const serialized = JSON.stringify(messages) ?? "";
  return Math.ceil(serialized.length / 4);
}
/**
 * Compress chat messages through the Headroom sidecar's `/compress` endpoint.
 *
 * Best-effort: when Headroom is disabled, the payload is below
 * `minTokens`, or the request fails or times out, the original `messages`
 * and `tools` are returned unchanged with `compressed: false`. Failures are
 * logged and counted in the module metrics rather than propagated from the
 * request/response handling.
 *
 * @param {Array} messages - Chat messages in Anthropic format
 * @param {Array} tools - Tool definitions
 * @param {Object} options - Compression options: `model`, `modelLimit`,
 *   `mode`, `tokenBudget`, `queryContext`, `preserveRecentTurns`,
 *   `targetRatio`. Unset values fall back to defaults (`model`,
 *   `modelLimit`, `mode`) or are omitted from the request body.
 * @returns {Promise<Object>} { messages, tools, compressed, stats }
 */
async function compressMessages(messages, tools = [], options = {}) {
  const headroomConfig = getConfig();
  metrics.totalCalls++;

  if (!isEnabled()) {
    return {
      messages,
      tools,
      compressed: false,
      stats: { skipped: true, reason: "disabled" },
    };
  }

  // Estimate tokens - skip the round trip if below threshold
  const estimatedTokens = estimateTokens(messages);
  if (estimatedTokens < headroomConfig.minTokens) {
    metrics.skippedCompressions++;
    return {
      messages,
      tools,
      compressed: false,
      stats: {
        skipped: true,
        reason: `Below threshold (${estimatedTokens} < ${headroomConfig.minTokens})`,
      },
    };
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), headroomConfig.timeoutMs);

  try {
    const response = await fetch(`${headroomConfig.endpoint}/compress`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        messages,
        tools,
        model: options.model || "claude-3-5-sonnet-20241022",
        model_limit: options.modelLimit || 200000,
        mode: options.mode || headroomConfig.mode,
        token_budget: options.tokenBudget,
        query_context: options.queryContext,
        preserve_recent_turns: options.preserveRecentTurns,
        target_ratio: options.targetRatio,
      }),
      signal: controller.signal,
    });

    // The body is read while the abort timer is still armed, so a stalled
    // response is bounded by timeoutMs as well.
    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`Headroom returned ${response.status}: ${errorText}`);
    }

    const result = await response.json();

    // Update metrics
    if (result.compressed) {
      metrics.successfulCompressions++;
      metrics.totalTokensSaved += result.stats?.tokens_saved || 0;
      metrics.totalLatencyMs += result.stats?.latency_ms || 0;
      logger.info(
        {
          tokensBefore: result.stats?.tokens_before,
          tokensAfter: result.stats?.tokens_after,
          savingsPercent: result.stats?.savings_percent,
          compressionRatio: result.stats?.compression_ratio,
          latencyMs: result.stats?.latency_ms,
          transforms: result.stats?.transforms_applied,
          headroomVersion: result.stats?.headroom_version,
        },
        "Headroom compression applied"
      );
    } else {
      metrics.skippedCompressions++;
      logger.debug({ reason: result.stats?.reason }, "Headroom compression skipped");
    }

    return {
      messages: result.messages,
      tools: result.tools,
      compressed: result.compressed,
      stats: result.stats,
    };
  } catch (err) {
    metrics.failures++;
    if (err.name === "AbortError") {
      logger.warn({ timeoutMs: headroomConfig.timeoutMs }, "Headroom compression timed out");
    } else {
      logger.warn({ error: err.message }, "Headroom compression failed, using original");
    }
    // Fall back to the caller's original payload.
    return {
      messages,
      tools,
      compressed: false,
      stats: { skipped: true, reason: err.message },
    };
  } finally {
    // Runs on every exit path so a failed request never leaves the timer pending.
    clearTimeout(timeout);
  }
}
/**
 * Retrieve original content from the CCR store via the sidecar's
 * `/ccr/retrieve` endpoint.
 *
 * Never rejects: failures are logged and returned as
 * `{ success: false, error }`. The request, including reading the response
 * body, is aborted after `timeoutMs` from the Headroom config.
 *
 * @param {string} hash - Hash key from compression marker
 * @param {string} query - Optional search query to filter results
 * @param {number} maxResults - Maximum results for search (default 20)
 * @returns {Promise<Object>} { success, content, itemsRetrieved, wasSearch, error }
 */
async function ccrRetrieve(hash, query = null, maxResults = 20) {
  const headroomConfig = getConfig();

  if (!isEnabled()) {
    return { success: false, error: "Headroom disabled" };
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), headroomConfig.timeoutMs);

  try {
    const response = await fetch(`${headroomConfig.endpoint}/ccr/retrieve`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ hash, query, max_results: maxResults }),
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new Error(`CCR retrieve returned ${response.status}`);
    }

    // The body is read while the abort timer is still armed.
    const result = await response.json();

    // Only successful lookups are counted, split by whether the sidecar
    // reports having performed a search.
    if (result.success) {
      if (result.was_search) {
        metrics.ccrSearches++;
        logger.debug({ hash, query, items: result.items_retrieved }, "CCR search completed");
      } else {
        metrics.ccrRetrievals++;
        logger.debug({ hash, items: result.items_retrieved }, "CCR retrieval completed");
      }
    }

    return {
      success: result.success,
      content: result.content,
      itemsRetrieved: result.items_retrieved || 0,
      wasSearch: result.was_search || false,
      error: result.error,
    };
  } catch (err) {
    logger.error({ error: err.message, hash }, "CCR retrieval failed");
    return { success: false, error: err.message };
  } finally {
    // Runs on every exit path so a failed request never leaves the timer pending.
    clearTimeout(timeout);
  }
}
/**
 * Snapshot the client-side counters together with three derived figures.
 *
 * @returns {Object} A copy of every counter in `metrics`, plus:
 *   - `averageLatencyMs`: total latency / successful compressions, rounded
 *     (0 when there were none);
 *   - `compressionRate`: successful compressions as a rounded percentage
 *     of total calls (0 when there were no calls);
 *   - `failureRate`: failures as a rounded percentage of total calls
 *     (0 when there were no calls).
 */
function getMetrics() {
  const { totalCalls, successfulCompressions, failures, totalLatencyMs } = metrics;

  // Share of all calls, as a whole-number percentage.
  const percentOfCalls = (count) => {
    if (totalCalls > 0) {
      return Math.round((count / totalCalls) * 100);
    }
    return 0;
  };

  let averageLatencyMs = 0;
  if (successfulCompressions > 0) {
    averageLatencyMs = Math.round(totalLatencyMs / successfulCompressions);
  }

  return {
    ...metrics,
    averageLatencyMs,
    compressionRate: percentOfCalls(successfulCompressions),
    failureRate: percentOfCalls(failures),
  };
}
/**
 * Fetch the sidecar's own metrics from its `/metrics` endpoint.
 *
 * The request carries a 2-second timeout signal. Any failure is logged at
 * debug level and reported as `null`.
 *
 * @returns {Promise<Object|null>} Parsed JSON body on a 2xx reply; `null`
 *   when Headroom is disabled, the reply is not ok, or the request fails.
 */
async function getServerMetrics() {
  const sidecar = getConfig();

  if (!isEnabled()) {
    return null;
  }

  try {
    const url = `${sidecar.endpoint}/metrics`;
    const reply = await fetch(url, { signal: AbortSignal.timeout(2000) });
    return reply.ok ? await reply.json() : null;
  } catch (fetchError) {
    logger.debug({ error: fetchError.message }, "Failed to fetch server metrics");
    return null;
  }
}
/**
 * Get combined metrics (client + server).
 *
 * The client snapshot is taken before the sidecar is queried.
 *
 * @returns {Promise<Object>} `{ enabled, endpoint, client, server }` where
 *   `client` is the result of getMetrics(), `server` is the result of
 *   getServerMetrics() (may be `null`), and `endpoint` is `undefined` when
 *   no Headroom config section exists.
 */
async function getCombinedMetrics() {
  const clientMetrics = getMetrics();
  const serverMetrics = await getServerMetrics();

  return {
    enabled: isEnabled(),
    // isEnabled() tolerates a missing headroom config section; do the same
    // here so a metrics request does not throw when the section is absent.
    endpoint: getConfig()?.endpoint,
    client: clientMetrics,
    server: serverMetrics,
  };
}
/**
 * Zero every client-side counter in place.
 *
 * The `metrics` object itself is kept; only its values are overwritten.
 */
function resetMetrics() {
  for (const counter of Object.keys(metrics)) {
    metrics[counter] = 0;
  }
}
// Public surface of the Headroom client. getConfig() and the raw `metrics`
// object are not exported; counters are exposed through getMetrics() and
// cleared through resetMetrics().
module.exports = {
isEnabled,
checkHealth,
compressMessages,
ccrRetrieve,
getMetrics,
getServerMetrics,
getCombinedMetrics,
resetMetrics,
estimateTokens,
};