@aj-archipelago/cortex

Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.

420 lines (362 loc) 18.9 kB
import Bottleneck from 'bottleneck/es5.js';
import RequestMonitor from './requestMonitor.js';
import { config } from '../config.js';
import axios from 'axios';
import { setupCache } from 'axios-cache-interceptor';
import Redis from 'ioredis';
import logger from './logger.js';
import { v4 as uuidv4 } from 'uuid';

const connectionString = config.get('storageConnectionString');

if (!connectionString) {
    logger.info('No STORAGE_CONNECTION_STRING found in environment. Redis features (caching, pubsub, clustered limiters) disabled.');
} else {
    logger.info('Using Redis connection specified in STORAGE_CONNECTION_STRING.');
}

let client;
if (connectionString) {
    try {
        client = new Redis(connectionString);
    } catch (error) {
        logger.error(`Redis connection error: ${error}`);
    }
}

const cortexId = config.get('cortexId');
const connection = client && new Bottleneck.IORedisConnection({ client: client });

let modelEndpoints = {};

const createLimiter = (endpoint, name, index) => {
    const rps = endpoint.requestsPerSecond ?? 100;
    let limiterOptions = {
        minTime: 1000 / rps,
        maxConcurrent: rps,
        reservoir: rps, // Number of tokens available initially
        reservoirRefreshAmount: rps, // Number of tokens added per interval
        reservoirRefreshInterval: 1000, // Interval in milliseconds
    };

    // If Redis connection exists, add id and connection to enable clustering
    if (connection) {
        limiterOptions.id = `${cortexId}-${name}-${index}-limiter`; // Unique id for each limiter
        limiterOptions.connection = connection; // Shared Redis connection
    }

    endpoint.limiter = new Bottleneck(limiterOptions);

    endpoint.limiter.on('error', (err) => {
        logger.error(`Limiter error for ${cortexId}-${name}-${index}: ${err}`);
        endpoint.limiter.disconnect();
        createLimiter(endpoint, name, index);
        logger.info(`New limiter created for ${cortexId}-${name}-${index}`);
    });

    endpoint.limiter.on('failed', (error, info) => {
        if (error.name === 'CanceledError') {
            logger.debug(`Limiter request cancelled for ${cortexId}-${name}-${index}: Id: ${info.options.id || 'none'}`);
            endpoint.monitor.incrementErrorCount();
        } else {
            logger.error(`Limiter request failed for ${cortexId}-${name}-${index}: Id: ${info.options.id || 'none'}: ${error?.message || error}`);
        }
    });

    endpoint.limiter.on('debug', (message) => {
        if (!message.includes('heartbeat.lua')) {
            logger.debug(`Limiter ${cortexId}-${name}-${index}: ${message}`);
        }
    });
}

const buildModelEndpoints = (config) => {
    modelEndpoints = JSON.parse(JSON.stringify(config.get('models')));
    logger.info(`Building ${connection ? 'Redis clustered' : 'local'} model rate limiters for ${cortexId}...`);
    for (const [name, model] of Object.entries(modelEndpoints)) {
        model.endpoints.forEach((endpoint, index) => {
            createLimiter(endpoint, name, index);
            endpoint.monitor = new RequestMonitor();
        });
    }
}
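// Endpoint selection: prefer healthy endpoints; when their average call
// durations are within a small standard deviation of each other, rotate
// round-robin, otherwise route to the fastest one.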
let currentIndex = 0; // for round-robin selection

const selectEndpoint = (model) => {
    if (!model || !Array.isArray(model.endpoints) || model.endpoints.length === 0) {
        return null;
    } else {
        logger.debug(`Selecting endpoint for model ${model.name}...`);
        if (model.endpoints.length === 1) {
            logger.debug(`Only one endpoint for model ${model.name}. No selection required.`);
            return model.endpoints[0];
        }

        let healthyEndpoints = model.endpoints.filter(endpoint => endpoint.monitor.healthy);

        if (healthyEndpoints.length === 0) {
            const selectedEndpoint = model.endpoints[currentIndex % model.endpoints.length];
            currentIndex++;
            logger.warn(`No healthy endpoints for model ${model.name}. Using round-robin selection. Selected: ${selectedEndpoint.name || 'default'}`);
            return selectedEndpoint;
        }

        healthyEndpoints.forEach(endpoint => {
            logger.debug(`Healthy endpoint: ${endpoint.name || 'default'}, duration: ${endpoint.monitor.getAverageCallDuration()}ms`);
        });

        let selectedEndpoint;
        const durations = healthyEndpoints.map(endpoint => endpoint.monitor.getAverageCallDuration());

        if (shouldUseRoundRobin(durations)) {
            selectedEndpoint = healthyEndpoints[currentIndex % healthyEndpoints.length];
            currentIndex++;
            logger.debug(`All endpoints are performing similarly. Using round-robin selection. Selected: ${selectedEndpoint.name || 'default'}`);
        } else {
            selectedEndpoint = fastestEndpoint(healthyEndpoints);
            logger.debug(`Selected fastest endpoint: ${selectedEndpoint.name || 'default'}`);
        }

        return selectedEndpoint;
    }
}

const calculateStandardDeviation = (durations) => {
    const mean = durations.reduce((total, value) => total + value, 0) / durations.length;
    const variance = durations.reduce((total, value) => total + Math.pow(value - mean, 2), 0) / durations.length;
    return Math.sqrt(variance);
}

const shouldUseRoundRobin = (durations) => {
    const standardDeviation = calculateStandardDeviation(durations);
    const threshold = 10;
    return standardDeviation <= threshold;
}

const fastestEndpoint = (endpoints) => {
    return endpoints.reduce((fastest, current) => {
        if (current.monitor.getAverageCallDuration() < fastest.monitor.getAverageCallDuration()) {
            return current;
        } else {
            return fastest;
        }
    });
}

let cortexAxios = axios;

if (config.get('enableCache')) {
    // Setup cache
    cortexAxios = setupCache(axios, {
        // enable cache for all requests by default
        methods: ['get', 'post', 'put', 'delete', 'patch'],
        interpretHeader: false,
        ttl: 1000 * 60 * 60 * 24 * 7, // 7 days
    });
}

// log statistics about active endpoints
setInterval(() => {
    // Iterate over each model
    for (const [name, model] of Object.entries(modelEndpoints)) {
        // Iterate over each endpoint in the current model
        let endpointIndex = 0;
        model.endpoints.forEach((endpoint) => {
            const monitor = endpoint.monitor;
            if (!monitor) {
                // Skip if monitor does not exist
                return;
            }
            const callRate = monitor.getPeakCallRate();
            if (callRate > 0) {
                const error429Rate = monitor.getError429Rate();
                const errorRate = monitor.getErrorRate();
                const avgCallDuration = monitor.getAverageCallDuration();
                logger.debug('------------------------');
                logger.debug(`Monitor of ${name} endpoint ${endpoint.name || endpointIndex} Call rate: ${callRate} calls/sec, duration: ${avgCallDuration}ms, 429 errors: ${error429Rate * 100}%, errors: ${errorRate * 100}%`);
                logger.debug('------------------------');
            }
            endpointIndex++;
        });
    }
}, 30000); // Log rates every 30 seconds
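// Wrap each HTTP call in its endpoint's RequestMonitor so call durations,
// error counts, and 429 rates feed back into endpoint health and selection.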
const requestWithMonitor = async (endpoint, url, data, axiosConfigObj) => {
    //logger.warn(`Requesting ${url} with data: ${JSON.stringify(data)}`);
    const callId = endpoint?.monitor?.startCall();
    let response;
    try {
        if (axiosConfigObj?.method === 'GET') {
            logger.debug(`Getting ${url} with data: ${JSON.stringify(data)}`);
            response = await cortexAxios.get(url, axiosConfigObj);
        } else {
            logger.debug(`Posting ${url} with data: ${JSON.stringify(data)}`);
            response = await cortexAxios.post(url, data, axiosConfigObj);
        }
    } catch (error) {
        // throw new error with duration as part of the error data
        const { code, name } = error;
        const finalStatus = error?.response?.status ?? error?.status;
        const statusText = error?.response?.statusText ?? error?.statusText;
        const errorMessage = error?.response?.data?.message ?? error?.response?.data?.error?.message ?? error?.message ?? String(error);
        throw {
            code,
            message: errorMessage,
            status: finalStatus,
            statusText,
            name,
            duration: endpoint?.monitor?.incrementErrorCount(callId, finalStatus)
        };
    }

    let duration;
    if (response.status >= 200 && response.status < 300) {
        duration = endpoint?.monitor?.endCall(callId);
    } else {
        duration = endpoint?.monitor?.incrementErrorCount(callId, response.status);
    }

    return { response, duration };
}

const MAX_RETRY = 6; // retries for error handling
const MAX_DUPLICATE_REQUESTS = 3; // duplicate requests to manage latency spikes
const DUPLICATE_REQUEST_AFTER = 10; // 10 seconds

const getDuplicateRequestDelay = (index, duplicateRequestAfter) => {
    const duplicateRequestTime = duplicateRequestAfter * Math.pow(2, index) - duplicateRequestAfter;
    const jitter = duplicateRequestTime * 0.2 * Math.random();
    const duplicateRequestTimeout = Math.max(0, duplicateRequestTime + jitter);
    return duplicateRequestTimeout;
}
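// makeRequest races up to MAX_DUPLICATE_REQUESTS staggered copies of a
// non-streaming request (each delayed by an exponentially growing, jittered
// timeout, and each aimed at a freshly selected endpoint), takes whichever
// settles first, and retries failures with exponential backoff, honoring
// Retry-After on 429s.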
const makeRequest = async (cortexRequest) => {
    let promises = [];

    // retry certain errors up to MAX_RETRY times
    for (let i = 0; i < MAX_RETRY; i++) {
        const { url, data, params, headers, cache, selectedEndpoint, requestId, pathway, model, stream, method } = cortexRequest;
        const enableDuplicateRequests = pathway?.enableDuplicateRequests !== undefined ? pathway.enableDuplicateRequests : config.get('enableDuplicateRequests');
        const maxDuplicateRequests = enableDuplicateRequests ? MAX_DUPLICATE_REQUESTS : 1;
        const duplicateRequestAfter = (pathway?.duplicateRequestAfter || DUPLICATE_REQUEST_AFTER) * 1000;
        const axiosConfigObj = { params, headers, cache, method };
        const streamRequested = (stream || params?.stream || data?.stream);

        // if we're using streaming, duplicate requests are
        // not supported, so we just push one promise into the array
        if (streamRequested && model.supportsStreaming) {
            axiosConfigObj.responseType = 'stream';
            promises.push(selectedEndpoint.limiter.schedule(
                { expiration: pathway.timeout * 1000 + 1000, id: `${requestId}_${uuidv4()}` },
                () => requestWithMonitor(selectedEndpoint, url, data, axiosConfigObj)
            ));
        } else {
            if (streamRequested) {
                logger.info(`>>> [${requestId}] ${model.name || 'This model'} does not support streaming - sending non-streaming request`);
                // guard: params and data may be absent when stream was requested at the top level
                if (axiosConfigObj.params) { axiosConfigObj.params.stream = false; }
                if (data) { data.stream = false; }
            }

            // if we're not streaming, we push at least one promise
            // into the array, but if we're supporting duplicate
            // requests we push one for each potential duplicate,
            // heading to a new endpoint (if available) and
            // staggered by a jittered amount of time
            const controllers = Array.from({ length: maxDuplicateRequests }, () => new AbortController());

            promises = controllers.map((controller, index) =>
                new Promise((resolve, reject) => {
                    setTimeout(async () => {
                        try {
                            if (index > 0) {
                                cortexRequest.selectNewEndpoint();
                            }
                            const { url, data, params, headers, cache, selectedEndpoint, requestId, pathway, model } = cortexRequest;
                            const endpointName = selectedEndpoint.name || model;
                            if (!selectedEndpoint.limiter) {
                                throw new Error(`No limiter for endpoint ${endpointName}!`);
                            }
                            const axiosConfigObj = { params, headers, cache, method };
                            let response = null;
                            let duration = null;
                            if (!controller.signal?.aborted) {
                                axiosConfigObj.signal = controller.signal;
                                axiosConfigObj.headers['X-Cortex-Request-Index'] = index;
                                if (index > 0) {
                                    const logMessage = `>>> [${requestId}] taking too long - sending duplicate request ${index} to ${endpointName} API`;
                                    const header = '>'.repeat(logMessage.length);
                                    logger.info(`\n${header}\n${logMessage}`);
                                }
                                ({ response, duration } = await selectedEndpoint.limiter.schedule(
                                    { expiration: pathway.timeout * 1000 + 1000, id: `${requestId}_${uuidv4()}` },
                                    () => requestWithMonitor(selectedEndpoint, url, data, axiosConfigObj)
                                ));
                                if (!controller.signal?.aborted) {
                                    logger.verbose(`<<< [${requestId}] received response for request ${index}`);
                                }
                            }
                            resolve({ response, duration });
                        } catch (error) {
                            if (error.name === 'AbortError' || error.name === 'CanceledError') {
                                //logger.info(`XXX [${requestId}] request ${index} was cancelled`);
                                reject(error);
                            } else {
                                logger.error(`!!! [${requestId}] request ${index} failed with error: ${error?.response?.data?.message || error?.response?.data?.error?.message || error?.message || error}`);
                                reject(error);
                            }
                        } finally {
                            controllers.forEach(controller => controller.abort());
                        }
                    }, getDuplicateRequestDelay(index, duplicateRequestAfter));
                })
            );
        }

        // no requests have been made yet, but the promises array
        // is full, so now we execute them in parallel
        try {
            const { response, duration } = await Promise.race(promises);
            // if response status is 2xx
            if (response?.status >= 200 && response?.status < 300) {
                return { response, duration };
            } else {
                const error = new Error(`Request failed with status ${response?.status}`);
                error.response = response;
                error.duration = duration;
                throw error;
            }
        } catch (error) {
            // Handle both cases: error with response object and direct error object
            const status = error?.response?.status || error?.status || 502; // default to 502 if no status
            const duration = error?.duration;
            const response = error?.response || { error: error };

            // Calculate backoff time - use Retry-After for 429s if available
            let backoffTime = 1000 * Math.pow(2, i);
            if (status === 429 && (response?.headers?.['retry-after'] || error?.headers?.['retry-after'])) {
                backoffTime = parseInt(response?.headers?.['retry-after'] || error?.headers?.['retry-after']) * 1000;
                logger.warn(`>>> [${requestId}] Rate limited (429). Retry-After: ${response?.headers?.['retry-after'] || error?.headers?.['retry-after']}s`);
            }
            const jitter = backoffTime * 0.2 * Math.random();

            // if there is only one endpoint, only retry select error codes
            if (cortexRequest.model.endpoints.length === 1) {
                if (status !== 429 && status !== 408 && status !== 500 && status !== 502 && status !== 503 && status !== 504) {
                    return { response, duration };
                }
                // set up for a retry by reinitializing the request
                cortexRequest.initRequest();
            } else {
                // if there are multiple endpoints, retry everything by default
                // as it could be a temporary issue with one endpoint
                // certain errors (e.g. 400) are problems with the request itself
                // and should not be retried
                if (status === 400 || status === 413) {
                    return { response, duration };
                }
                // set up for a retry by selecting a new endpoint, which will also reinitialize the request
                cortexRequest.selectNewEndpoint();
            }

            if (i < MAX_RETRY - 1) {
                logger.info(`>>> [${requestId}] retrying request due to ${status} response. Retry count: ${i + 1}. Retrying in ${backoffTime + jitter}ms`);
                await new Promise(r => setTimeout(r, backoffTime + jitter));
            } else {
                return { response, duration };
            }
        }
    }
};
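// executeRequest is the public entry point: it unwraps makeRequest's
// { response, duration } result, surfaces cache hits, and normalizes
// error reporting before rethrowing.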
const executeRequest = async (cortexRequest) => {
    try {
        const result = await makeRequest(cortexRequest);

        // Validate that we have a result and it contains response
        if (!result || !result.response) {
            throw new Error('No response received from request');
        }

        const { response, duration } = result;
        const requestId = cortexRequest.requestId;

        // Validate response object
        if (!response || typeof response !== 'object') {
            throw new Error('Invalid response object received');
        }

        const { data, cached } = response;

        if (cached) {
            logger.info(`<<< [${requestId}] served with cached response.`);
        }

        // Check for error object in the response
        if (response.error) {
            throw new Error(response.error.message || JSON.stringify(response.error));
        }

        // Check for HTTP error status
        if (response.status >= 400) {
            const errorMessage = response.data?.error?.message || response.statusText;
            const errorDetails = response.data ? `\nResponse data: ${JSON.stringify(response.data)}` : '';
            throw new Error(`HTTP error: ${response.status} ${errorMessage}${errorDetails}`);
        }

        return { data, duration };
    } catch (error) {
        // Add context to the error
        const requestId = cortexRequest?.requestId || 'unknown';
        const model = cortexRequest?.model?.name || 'unknown';
        const errorMessage = error.message || 'Unknown error occurred';
        logger.error(`Error in executeRequest for ${model} (requestId: ${requestId}): ${errorMessage}`);
        throw error;
    }
}

export { axios, executeRequest, buildModelEndpoints, selectEndpoint, modelEndpoints, createLimiter };
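For orientation, here is a minimal sketch of how these exports might be wired together. The cortexRequest shape (url, data, params, headers, requestId, pathway, model, selectedEndpoint, selectNewEndpoint, initRequest) is inferred from makeRequest/executeRequest above, not a documented API; the import path, model key, and endpoint URL are placeholders.

// Hypothetical usage sketch -- in Cortex the cortexRequest object is normally
// built by the framework, not assembled by hand as it is here.
import { buildModelEndpoints, selectEndpoint, modelEndpoints, executeRequest } from './requestExecutor.js'; // assumed path to this file
import { config } from '../config.js';

buildModelEndpoints(config); // attaches a limiter and RequestMonitor to every configured endpoint

const model = modelEndpoints['oai-gpt4o']; // assumed model key from the 'models' config
const cortexRequest = {
    model,
    selectedEndpoint: selectEndpoint(model),
    url: 'https://api.openai.com/v1/chat/completions', // placeholder endpoint URL
    data: { model: 'gpt-4o', messages: [{ role: 'user', content: 'Hello' }] },
    params: {},
    headers: { 'Content-Type': 'application/json' },
    requestId: 'example-request-1',
    pathway: { timeout: 300 }, // seconds; used to set the limiter expiration
    selectNewEndpoint() { this.selectedEndpoint = selectEndpoint(this.model); },
    initRequest() { /* re-apply auth headers, etc., before a retry */ },
};

const { data, duration } = await executeRequest(cortexRequest);
console.log(`completed in ${duration}ms`, data);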