UNPKG

revenium-middleware-litellm-node

Comprehensive middleware for Node.js applications using LiteLLM Proxy to automatically track LLM usage, costs, and performance metrics with Revenium

572 lines 21.6 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.patchHttpClient = patchHttpClient; exports.unpatchHttpClient = unpatchHttpClient; exports.isHttpClientPatched = isHttpClientPatched; exports.resetHttpClientManager = resetHttpClientManager; const config_1 = require("./config"); const tracking_1 = require("./tracking"); const crypto_1 = require("crypto"); const validation_1 = require("./utils/validation"); const error_handling_1 = require("./utils/error-handling"); const constants_1 = require("./constants"); // Global logger const logger = (0, config_1.getLogger)(); /** * HTTP client manager singleton for proper state management */ class HttpClientManager { constructor() { this.isPatched = false; this.originalFetch = null; // Private constructor to enforce singleton pattern } /** * Get the singleton instance */ static getInstance() { if (!HttpClientManager.instance) { HttpClientManager.instance = new HttpClientManager(); } return HttpClientManager.instance; } /** * Reset the singleton instance (for testing) */ static resetInstance() { HttpClientManager.instance = null; } /** * Check if HTTP client is patched */ isHttpClientPatched() { return this.isPatched; } /** * Get the original fetch function */ getOriginalFetch() { return this.originalFetch; } /** * Set the patched state and store original fetch */ setPatched(patched, originalFetch) { this.isPatched = patched; if (originalFetch) this.originalFetch = originalFetch; } /** * Reset to unpatched state (for testing) */ reset() { this.isPatched = false; this.originalFetch = null; } } HttpClientManager.instance = null; /** * Check if a URL is a LiteLLM Proxy endpoint (chat completions or embeddings) */ function isLiteLLMProxyRequest(url, config) { try { const requestUrl = new URL(url); const proxyUrl = new URL(config.litellmProxyUrl); // Check if the request is going to our configured LiteLLM Proxy const isSameHost = requestUrl.hostname === proxyUrl.hostname; const isSamePort = requestUrl.port === proxyUrl.port || (requestUrl.port === '80' || requestUrl.port === '443') && proxyUrl.port === ''; // Handle two cases: // 1. Proxy URL is a base URL (e.g., http://localhost:4000) - check if request is to supported endpoint // 2. Proxy URL is a full endpoint URL (e.g., http://localhost:4000/chat/completions) - check exact match let isCorrectEndpoint = false; if (constants_1.supportedEndpoints.some(endpoint => proxyUrl.pathname.endsWith(endpoint))) { // Case 2: Proxy URL includes the endpoint path - check exact path match isCorrectEndpoint = requestUrl.pathname === proxyUrl.pathname; } else { // Case 1: Proxy URL is base URL - check if request is to any supported endpoint isCorrectEndpoint = constants_1.supportedEndpoints.some(endpoint => requestUrl.pathname.endsWith(endpoint)); } return isSameHost && isSamePort && isCorrectEndpoint; } catch (error) { return false; } } /** * Create patched fetch function that intercepts LiteLLM Proxy requests */ function createPatchedFetch() { return async function patchedFetch(input, init) { const config = (0, config_1.getConfig)(); // Convert input to URL string for checking const url = typeof input === 'string' ? input : input instanceof URL ? 
input.toString() : input.url; // Only intercept LiteLLM Proxy requests if we have config if (!config || !isLiteLLMProxyRequest(url, config)) { const clientManager = HttpClientManager.getInstance(); const originalFetchFn = clientManager.getOriginalFetch(); if (!originalFetchFn) throw new Error('Original fetch function not available'); return originalFetchFn(input, init); } // Validate the URL against our proxy configuration const urlValidation = (0, validation_1.validateLiteLLMUrl)(url, config.litellmProxyUrl); if (!urlValidation.isValid) { logger.warn('Invalid LiteLLM proxy URL detected', { url, errors: urlValidation.errors, configuredProxy: config.litellmProxyUrl }); // Continue with original fetch for invalid URLs const clientManager = HttpClientManager.getInstance(); const originalFetchFn = clientManager.getOriginalFetch(); if (!originalFetchFn) throw new Error('Original fetch function not available'); return originalFetchFn(input, init); } // Extract and validate request context const rawHeaders = init?.headers ? Object.fromEntries(new Headers(init.headers)) : {}; const validatedHeaders = (0, validation_1.validateHeaders)(rawHeaders); const requestContext = { url, method: init?.method || 'GET', headers: validatedHeaders, body: init?.body || null, startTime: Date.now(), metadata: (0, tracking_1.extractMetadataFromHeaders)(validatedHeaders) }; const requestId = (0, crypto_1.randomUUID)(); logger.debug('Intercepted LiteLLM Proxy request', { url: requestContext.url, method: requestContext.method, requestId, hasMetadata: !!requestContext.metadata }); try { // Add LiteLLM Proxy authentication if configured const headers = new Headers(init?.headers); if (config.litellmApiKey) headers.set('Authorization', `Bearer ${config.litellmApiKey}`); // Make the actual request const clientManager = HttpClientManager.getInstance(); const originalFetchFn = clientManager.getOriginalFetch(); if (!originalFetchFn) throw new Error('Original fetch function not available'); const response = await originalFetchFn(input, { ...init, headers }); const endTime = Date.now(); const duration = endTime - requestContext.startTime; // Clone response to read body without consuming it const responseClone = response.clone(); logger.debug('LiteLLM Proxy response received', { status: response.status, requestId, duration }); // Handle successful chat completion responses if (response.ok && requestContext.method === 'POST') { handleSuccessfulResponse(requestContext, response, responseClone, requestId, duration); } else if (!response.ok) { logger.warn('LiteLLM Proxy request failed', { status: response.status, statusText: response.statusText, requestId }); } return response; } catch (error) { const endTime = Date.now(); const duration = endTime - requestContext.startTime; logger.error('LiteLLM Proxy request error', { error: error instanceof Error ? error.message : String(error), requestId, duration }); throw error; } }; } /** * Parse request body for either chat completions or embeddings */ async function parseRequestBody(requestContext, requestId, endpointType) { if (!requestContext.body) return null; try { const bodyText = typeof requestContext.body === 'string' ? requestContext.body : await new Response(requestContext.body).text(); return JSON.parse(bodyText); } catch (error) { logger.warn(`Failed to parse ${endpointType} request body`, { requestId, error: error instanceof Error ? 
error.message : String(error) }); return null; } } /** * Handle successful chat completion response and track usage */ async function handleSuccessfulResponse(requestContext, originalResponse, responseClone, requestId, duration) { try { // Determine endpoint type from URL const url = new URL(requestContext.url); const isEmbeddingsEndpoint = url.pathname.endsWith('/embeddings') || url.pathname.endsWith('/v1/embeddings'); if (isEmbeddingsEndpoint) { // Handle embeddings request const requestData = await parseRequestBody(requestContext, requestId, 'embeddings'); const model = requestData?.model || 'unknown'; await handleEmbeddingResponse(responseClone, requestContext, requestId, duration, model); } else { // Handle chat completions request const requestData = await parseRequestBody(requestContext, requestId, 'chat'); const isStreaming = requestData?.stream === true; const model = requestData?.model || 'unknown'; if (isStreaming) { await handleStreamingResponse(responseClone, requestContext, requestId, duration, model); } else { handleNonStreamingResponse(responseClone, requestContext, requestId, duration, model); } } } catch (error) { logger.error('Error handling LiteLLM response', { error: error instanceof Error ? error.message : String(error), requestId }); } } /** * Handle non-streaming chat completion response */ async function handleNonStreamingResponse(response, requestContext, requestId, duration, model) { try { const responseData = await response.json(); const usage = (0, tracking_1.extractUsageFromResponse)(responseData); logger.debug('Extracted usage from non-streaming response', { requestId, model, ...usage }); // Track usage asynchronously (0, tracking_1.trackUsageAsync)({ requestId, model, promptTokens: usage.promptTokens, completionTokens: usage.completionTokens, totalTokens: usage.totalTokens, duration, finishReason: usage.finishReason, usageMetadata: requestContext.metadata, isStreamed: false }); } catch (error) { logger.error('Error processing non-streaming response', { error: error instanceof Error ? error.message : String(error), requestId }); } } /** * Handle streaming chat completion response */ async function handleStreamingResponse(response, requestContext, requestId, duration, model) { logger.debug('Processing streaming response', { requestId, model }); if (!response.body) { logger.warn('Streaming response has no body', { requestId }); // Track with minimal data if no body (0, tracking_1.trackUsageAsync)({ requestId, model, promptTokens: 0, completionTokens: 0, totalTokens: 0, duration, finishReason: 'stop', usageMetadata: requestContext.metadata, isStreamed: true, timeToFirstToken: duration }); return; } try { const streamParser = new StreamingResponseParser(requestId, model, requestContext, duration); await streamParser.parseStream(response.body); } catch (error) { logger.error('Error parsing streaming response', { error: error instanceof Error ? 
error.message : String(error), requestId }); // Track with minimal data on error (0, tracking_1.trackUsageAsync)({ requestId, model, promptTokens: 0, completionTokens: 0, totalTokens: 0, duration, finishReason: 'error', usageMetadata: requestContext.metadata, isStreamed: true, timeToFirstToken: duration }); } } /** * Handle embeddings response and track usage */ async function handleEmbeddingResponse(response, requestContext, requestId, duration, model) { try { const responseData = await response.json(); const usage = responseData.usage; logger.debug('Extracted usage from embeddings response', { requestId, model, promptTokens: usage.prompt_tokens, totalTokens: usage.total_tokens }); // Track embeddings usage asynchronously (0, tracking_1.trackEmbeddingsUsageAsync)({ requestId, model, promptTokens: usage.prompt_tokens, totalTokens: usage.total_tokens, duration, usageMetadata: requestContext.metadata }); } catch (error) { logger.error('Error processing embeddings response', { error: error instanceof Error ? error.message : String(error), requestId }); } } /** * Streaming response parser for LiteLLM SSE streams */ class StreamingResponseParser { constructor(requestId, model, requestContext, requestDuration) { this.firstTokenTime = null; this.promptTokens = 0; this.completionTokens = 0; this.totalTokens = 0; this.finishReason = null; this.logger = (0, config_1.getLogger)(); this.requestId = requestId; this.model = model; this.requestContext = requestContext; this.requestDuration = requestDuration; this.startTime = Date.now(); } async parseStream(body) { const reader = body.getReader(); const decoder = new TextDecoder(); let buffer = ''; try { while (true) { const { done, value } = await reader.read(); if (done) break; // Decode chunk and add to buffer buffer += decoder.decode(value, { stream: true }); // Process complete SSE messages const lines = buffer.split('\n'); buffer = lines.pop() || ''; // Keep incomplete line in buffer for (const line of lines) { this.processSSELine(line); } } // Process any remaining buffer content if (buffer.trim()) { this.processSSELine(buffer); } } finally { reader.releaseLock(); this.finalizeTracking(); } } processSSELine(line) { const trimmed = line.trim(); // Skip empty lines and comments if (!trimmed || trimmed.startsWith(':')) return; // Parse SSE data lines if (trimmed.startsWith('data: ')) { const data = trimmed.slice(6); // Remove 'data: ' prefix // Check for stream end marker if (data === '[DONE]') { this.logger.debug('Stream completed', { requestId: this.requestId }); return; } try { const chunk = JSON.parse(data); this.processStreamChunk(chunk); } catch (error) { this.logger.debug('Failed to parse stream chunk', { requestId: this.requestId, data: data.substring(0, 100) }); } } } processStreamChunk(chunk) { // Validate and sanitize chunk data const validatedChunk = (0, validation_1.validateStreamChunk)(chunk); if (!validatedChunk) { this.logger.debug('Invalid stream chunk received', { requestId: this.requestId, chunkType: typeof chunk }); return; } // Record first token time if (this.firstTokenTime === null && validatedChunk.choices?.[0]?.delta?.content) { this.firstTokenTime = Date.now(); this.logger.debug('First token received', { requestId: this.requestId, timeToFirstToken: this.firstTokenTime - this.startTime }); } // Extract usage information (typically in the last chunk) if (validatedChunk.usage) { this.promptTokens = validatedChunk.usage.prompt_tokens || 0; this.completionTokens = validatedChunk.usage.completion_tokens || 0; this.totalTokens = 
validatedChunk.usage.total_tokens || 0; this.logger.debug('Usage data extracted from stream', { requestId: this.requestId, promptTokens: this.promptTokens, completionTokens: this.completionTokens, totalTokens: this.totalTokens }); } // Extract finish reason if (validatedChunk.choices?.[0]?.finish_reason) this.finishReason = validatedChunk.choices[0].finish_reason; // Some providers send usage in different chunk structures if (!this.totalTokens && validatedChunk.x_groq?.usage) { // Groq-specific usage format this.promptTokens = validatedChunk.x_groq.usage.prompt_tokens || 0; this.completionTokens = validatedChunk.x_groq.usage.completion_tokens || 0; this.totalTokens = validatedChunk.x_groq.usage.total_tokens || 0; } } finalizeTracking() { const timeToFirstToken = this.firstTokenTime ? (this.firstTokenTime - this.startTime) : this.requestDuration; this.logger.debug('Finalizing streaming response tracking', { requestId: this.requestId, model: this.model, promptTokens: this.promptTokens, completionTokens: this.completionTokens, totalTokens: this.totalTokens, finishReason: this.finishReason, timeToFirstToken }); // Track the complete streaming response (0, tracking_1.trackUsageAsync)({ requestId: this.requestId, model: this.model, promptTokens: this.promptTokens, completionTokens: this.completionTokens, totalTokens: this.totalTokens, duration: this.requestDuration, finishReason: this.finishReason || 'stop', usageMetadata: this.requestContext.metadata, isStreamed: true, timeToFirstToken }); } } /** * Patch the global fetch function to intercept LiteLLM Proxy requests */ function patchHttpClient() { const clientManager = HttpClientManager.getInstance(); if (clientManager.isHttpClientPatched()) { logger.debug('HTTP client already patched'); return true; } if (typeof globalThis.fetch !== 'function') { const errorContext = (0, error_handling_1.createErrorContext)() .with('fetchType', typeof globalThis.fetch) .build(); logger.error('Global fetch function not available', errorContext); return false; } try { // Store original fetch const originalFetch = globalThis.fetch; clientManager.setPatched(false, originalFetch); // Replace with patched version globalThis.fetch = createPatchedFetch(); clientManager.setPatched(true); logger.info('LiteLLM HTTP client middleware enabled'); return true; } catch (error) { const errorContext = (0, error_handling_1.createErrorContext)() .with('error', error instanceof Error ? error.message : String(error)) .with('stack', error instanceof Error ? error.stack : undefined) .build(); logger.error('Failed to patch HTTP client', errorContext); // Throw a proper error for better debugging throw new error_handling_1.PatchingError('Failed to patch HTTP client for LiteLLM interception', errorContext); } } /** * Restore the original fetch function */ function unpatchHttpClient() { const clientManager = HttpClientManager.getInstance(); if (!clientManager.isHttpClientPatched()) { logger.debug('HTTP client not patched'); return true; } const originalFetch = clientManager.getOriginalFetch(); if (!originalFetch) { logger.error('Original fetch function not stored'); return false; } try { globalThis.fetch = originalFetch; clientManager.setPatched(false); logger.info('LiteLLM HTTP client middleware disabled'); return true; } catch (error) { logger.error('Failed to unpatch HTTP client', { error: error instanceof Error ? 
error.message : String(error) }); return false; } } /** * Check if HTTP client is patched */ function isHttpClientPatched() { return HttpClientManager.getInstance().isHttpClientPatched(); } /** * Reset HTTP client manager (for testing) */ function resetHttpClientManager() { HttpClientManager.resetInstance(); } //# sourceMappingURL=client.js.map
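
A minimal usage sketch, not taken from the package's documentation: it assumes patchHttpClient, unpatchHttpClient, and isHttpClientPatched are re-exported from the package root, and that the middleware's config module resolves litellmProxyUrl to http://localhost:4000 (the example URL used in the comments above). Once patched, an ordinary fetch call to a supported proxy endpoint is intercepted and its token usage reported to Revenium.

// Usage sketch with assumed import path and proxy URL; adjust both to your setup.
const {
    patchHttpClient,
    unpatchHttpClient,
    isHttpClientPatched,
} = require('revenium-middleware-litellm-node'); // import path is an assumption

async function main() {
    // Swap globalThis.fetch for the intercepting version.
    if (!patchHttpClient()) {
        throw new Error('Failed to enable LiteLLM middleware');
    }
    console.log('patched:', isHttpClientPatched()); // true

    // Any fetch to a supported endpoint on the configured proxy
    // (/chat/completions or /embeddings) is now tracked automatically;
    // usage is read from the JSON response body.
    const response = await fetch('http://localhost:4000/chat/completions', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
            model: 'gpt-4o-mini', // example model name
            messages: [{ role: 'user', content: 'Hello!' }],
        }),
    });
    const data = await response.json();
    console.log(data.choices?.[0]?.message?.content);

    // Restore the original fetch (useful in tests and shutdown paths).
    unpatchHttpClient();
}

main().catch(console.error);

For requests sent with stream: true, the same call is tracked through StreamingResponseParser, which reads token counts from the final SSE usage chunk instead of the JSON body.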