cmte
Version:
Design by Committee™ except it's just you and LLMs
322 lines (291 loc) • 14.1 kB
JavaScript
import { logger } from '../../utils/logger.js';
/**
 * Shared plumbing for LLM clients: draining a streamed response body,
 * retrying a call with exponential backoff, and a bounded request queue whose
 * concurrency limit and pre-request delay are tightened when an HTTP 429 is
 * seen and relaxed again step by step afterwards.
 */
export class BaseLLMClient {
  /** Total number of attempts made by withExponentialBackoff (always >= 1). */
  maxRetries = BaseLLMClient._readIntEnv('LLM_MAX_RETRIES', 20, 1);

  /** Base delay in milliseconds for the exponential backoff (1 second). */
  baseDelay = 1000;

  /** Upper bound in milliseconds for a computed exponential delay. */
  maxExponentialDelay = BaseLLMClient._readIntEnv('LLM_MAX_RETRY_DELAY_MS', 300000, 0);

  /** Maximum number of queued requests allowed to run at once (always >= 1). */
  maxParallelRequests = BaseLLMClient._readIntEnv('MAX_PARALLEL_REQUESTS', 10, 1);

  /**
   * Minimum delay (ms) applied before a queued request starts. Read from
   * REQUEST_DELAY_SECONDS (seconds); defaults to 100ms when unset or invalid.
   */
  _minInterRequestDelayMs = (() => {
    const delaySeconds = Number.parseFloat(process.env.REQUEST_DELAY_SECONDS ?? '');
    if (Number.isFinite(delaySeconds) && delaySeconds >= 0) {
      return Math.round(delaySeconds * 1000);
    }
    return 100;
  })();

  /** Delay (ms) applied before queued requests start after a rate limit hit. */
  _maxInterRequestDelayMs = 45000;

  /** Interval (ms) between successive recovery steps after a rate limit hit. */
  _recoveryIntervalMs = 60 * 1000;

  /**
   * @param config Client configuration. This class reads `model`,
   *   `savePrompts` and `outputPath` from it.
   */
  constructor(config) {
    this.config = config;
    // Queue state for managing parallel requests.
    this.activeRequests = 0;
    this.requestQueue = [];
    // The live limit starts at the configured maximum and shrinks on 429s.
    this._currentConcurrencyLimit = this.maxParallelRequests;
    this._increaseConcurrencyTimeout = null; // Handle of the pending recovery timer
    // Start with the configured (or default) minimum delay.
    this._interRequestDelayMs = this._minInterRequestDelayMs;
    logger.debug(`Initialized LLM client with max retries: ${this.maxRetries}, max parallel: ${this.maxParallelRequests}, current limit: ${this._currentConcurrencyLimit}, min request delay: ${this._minInterRequestDelayMs}ms, initial delay: ${this._interRequestDelayMs}ms, max exponential delay: ${this.maxExponentialDelay}ms`);
  }

  /**
   * Reads an integer from an environment variable.
   * Unset/empty values and values that do not parse as an integer yield
   * `fallback`; parsed values below `min` are raised to `min`.
   * @param {string} name Environment variable name
   * @param {number} fallback Value used when the variable is unset or invalid
   * @param {number} [min=0] Smallest value that may be returned
   * @returns {number} The validated integer
   */
  static _readIntEnv(name, fallback, min = 0) {
    const raw = process.env[name];
    if (raw === undefined || raw === '') {
      return fallback;
    }
    const parsed = Number.parseInt(raw, 10);
    if (Number.isNaN(parsed)) {
      logger.warn(`Ignoring invalid value for ${name}: "${raw}". Using ${fallback}.`);
      return fallback;
    }
    if (parsed < min) {
      logger.warn(`Value for ${name} (${parsed}) is below the minimum of ${min}. Using ${min}.`);
      return min;
    }
    return parsed;
  }

  /**
   * Resolves after `ms` milliseconds.
   * @param {number} ms Time to wait
   * @returns {Promise<void>}
   */
  static _sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Extracts a numeric HTTP status from a thrown value.
   * Any object with a numeric `status` qualifies, which covers both SDK
   * error objects and a raw fetch Response.
   * @param error The thrown value
   * @returns {number|null} The status code, or null when there is none
   */
  static _getHttpStatus(error) {
    if (error && typeof error === 'object' && typeof error.status === 'number') {
      return error.status;
    }
    return null;
  }

  /**
   * Converts a Retry-After header value to milliseconds.
   * Accepts a number of seconds or an HTTP-date; an HTTP-date in the past
   * yields 0.
   * @param value Raw header value
   * @returns {number|null} Non-negative delay in ms, or null if unparseable
   */
  static _parseRetryAfter(value) {
    const text = String(value).trim();
    const seconds = Number.parseFloat(text);
    if (Number.isFinite(seconds)) {
      return Math.max(0, Math.round(seconds * 1000));
    }
    const dateMs = Date.parse(text);
    if (!Number.isNaN(dateMs)) {
      return Math.max(0, dateMs - Date.now());
    }
    return null;
  }

  /**
   * Handles streaming response from LLM providers
   * @param response Fetch Response object containing a stream
   * @returns Concatenated response text
   * @throws {Error} When the response has no body
   */
  async handleStreamingResponse(response) {
    if (!response.body) {
      throw new Error('Response body is null');
    }
    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let responseText = '';
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        // stream: true keeps a multi-byte character split across chunks intact.
        responseText += decoder.decode(value, { stream: true });
      }
      // Flush: emits whatever the decoder still buffers (e.g. a truncated
      // trailing sequence) instead of silently dropping it.
      responseText += decoder.decode();
    } finally {
      reader.releaseLock();
    }
    return responseText;
  }

  /**
   * Executes a function with exponential backoff retry logic.
   *
   * 4xx errors other than 429 are rethrown immediately. A 429 shrinks the
   * queue's concurrency (see _handleRateLimitHit) and, when the error carries
   * a Retry-After-Ms / Retry-After header, that value is used as the wait
   * instead of the computed backoff. The function is always attempted at
   * least once, even if `maxRetries` was overwritten with an unusable value.
   * @param fn Function to execute
   * @returns Result of the function
   * @throws The last error thrown by `fn` once attempts are exhausted
   */
  async withExponentialBackoff(fn) {
    const maxAttempts = Math.max(1, Math.trunc(this.maxRetries) || 1);
    let lastError = null;
    logger.debug(`[Retry] Entering withExponentialBackoff. Max retries set to: ${maxAttempts}`);
    for (let attempt = 0; attempt < maxAttempts; attempt++) {
      try {
        return await fn();
      } catch (error) {
        lastError = error;
        const httpStatusCode = BaseLLMClient._getHttpStatus(error);
        const isRateLimitError = httpStatusCode === 429;

        // --- Non-retriable client errors (4xx excluding 429) ---
        if (httpStatusCode !== null && httpStatusCode >= 400 && httpStatusCode < 500 && !isRateLimitError) {
          logger.error(`Client error ${httpStatusCode}. Not retrying.`, { error });
          throw error;
        }

        // --- Rate limit (429): throttle the queue, honour the server's hint ---
        let specificRetryAfterMs = null;
        if (isRateLimitError) {
          logger.warn(`Rate limit exceeded (429). Checking for Retry-After header.`);
          this._handleRateLimitHit();
          specificRetryAfterMs = this._getRetryAfterMs(error);
        }

        // --- Out of attempts ---
        if (attempt >= maxAttempts - 1) {
          logger.error(`Max retries (${maxAttempts}) reached. Giving up on error:`, { error: { message: error?.message, status: httpStatusCode } });
          throw lastError;
        }

        // --- Wait, then go around again ---
        const delay = specificRetryAfterMs !== null ? specificRetryAfterMs : this._computeBackoffDelay(attempt);
        const errorMsg = error instanceof Error ? error.message : (httpStatusCode ? `Status ${httpStatusCode}` : 'Unknown error');
        logger.warn(`Attempt ${attempt + 1}/${maxAttempts} failed (${errorMsg}). Retrying in ${Math.round(delay)}ms...`);
        await BaseLLMClient._sleep(delay);
      }
    }
    // Defensive: the loop above always returns or throws.
    logger.error('Retry loop completed without success or throwing final error. Rethrowing last known error.');
    throw lastError || new Error('Operation failed after retries, but last error was lost.');
  }

  /**
   * Computes the jittered exponential delay for a failed attempt.
   * @param {number} attempt Zero-based index of the attempt that just failed
   * @returns {number} Delay in ms, capped at maxExponentialDelay
   */
  _computeBackoffDelay(attempt) {
    const exponentialDelay = this.baseDelay * Math.pow(2, attempt);
    // Jitter of +/-25% so parallel callers do not retry in lockstep.
    const jittered = exponentialDelay * (0.75 + Math.random() * 0.5);
    return Math.min(jittered, this.maxExponentialDelay);
  }

  /**
   * Looks for a server-provided retry delay on a rate limit error.
   * Headers are searched at error.error.headers, then error.response.headers,
   * then error.headers, and may be either an object exposing `get()` or a
   * plain object. Retry-After-Ms takes precedence over Retry-After.
   * @param error The rate limit error
   * @returns {number|null} Delay in ms, or null when none could be read
   */
  _getRetryAfterMs(error) {
    let retryAfterMs = null;
    try {
      const headers = error.error?.headers || error.response?.headers || error.headers || null;
      let retryAfterHeader = null;
      let retryAfterMsHeader = null;
      if (headers) {
        if (typeof headers.get === 'function') {
          retryAfterHeader = headers.get('retry-after') || headers.get('Retry-After');
          retryAfterMsHeader = headers.get('retry-after-ms') || headers.get('Retry-After-Ms');
        } else if (typeof headers === 'object') {
          // Plain objects are case-sensitive, so normalise the keys first.
          const lowerCaseHeaders = Object.fromEntries(Object.entries(headers).map(([k, v]) => [k.toLowerCase(), v]));
          retryAfterHeader = lowerCaseHeaders['retry-after'];
          retryAfterMsHeader = lowerCaseHeaders['retry-after-ms'];
        }
      }
      // Presence check rather than truthiness, so a numeric 0 is honoured.
      const isPresent = (value) => value !== null && value !== undefined && value !== '';
      if (isPresent(retryAfterMsHeader)) {
        const parsedMs = Number.parseFloat(String(retryAfterMsHeader));
        if (Number.isFinite(parsedMs)) {
          retryAfterMs = Math.max(0, Math.round(parsedMs));
          logger.info(`Retry-After-Ms header found: waiting ${retryAfterMs} ms.`);
        } else {
          logger.warn(`Could not parse Retry-After-Ms header value: ${retryAfterMsHeader}`);
        }
      } else if (isPresent(retryAfterHeader)) {
        const parsedMs = BaseLLMClient._parseRetryAfter(retryAfterHeader);
        if (parsedMs !== null) {
          retryAfterMs = parsedMs;
          logger.info(`Retry-After header found: waiting ${retryAfterMs / 1000} seconds.`);
        } else {
          logger.warn(`Could not parse Retry-After header value: ${retryAfterHeader}`);
        }
      }
      if (retryAfterMs === null) {
        logger.warn('Could not find or parse Retry-After/Retry-After-Ms headers on the rate limit error.');
      }
    } catch (headerError) {
      // Best effort only: fall back to the computed backoff.
      logger.warn(`Error accessing headers on rate limit error object: ${headerError.message}`);
    }
    return retryAfterMs;
  }

  /**
   * Saves prompt content to a file if savePrompts is enabled.
   * Currently a stub: it returns without writing anything.
   * @param prompt Prompt content to save
   * @param type Type of prompt (e.g., 'thinking', 'response')
   */
  async savePromptToFile(prompt, type) {
    if (!this.config.savePrompts || !this.config.outputPath) {
      return;
    }
    // Implementation for saving prompts to be added
    // This will depend on the file system utilities available in the project
  }

  /**
   * Starts queued requests for as long as slots are available.
   * Loops (rather than starting a single request) so that every slot freed
   * by a raised concurrency limit is filled.
   */
  _tryProcessQueue() {
    while (this.activeRequests < this._currentConcurrencyLimit && this.requestQueue.length > 0) {
      this.activeRequests++;
      const { requestFn, resolve, reject } = this.requestQueue.shift();
      this._runQueuedRequest(requestFn, resolve, reject);
    }
  }

  /**
   * Runs one dequeued request and settles the promise handed to its caller.
   * The slot counted by _tryProcessQueue is released when the request ends.
   * @param {Function} requestFn Function that returns a Promise for the request
   * @param {Function} resolve Resolver of the promise returned by enqueueRequest
   * @param {Function} reject Rejecter of the promise returned by enqueueRequest
   */
  async _runQueuedRequest(requestFn, resolve, reject) {
    const requestId = Date.now() + Math.random().toString(16).slice(2);
    logger.debug(`[Parallelism] Starting request ${requestId}. Active: ${this.activeRequests} (Limit: ${this._currentConcurrencyLimit}, Delay: ${this._interRequestDelayMs}ms)`);
    // Defer so requestFn is never invoked synchronously from enqueueRequest.
    await Promise.resolve();
    // Hold the request back by the current delay before actually starting it.
    if (this._interRequestDelayMs > 0) {
      await BaseLLMClient._sleep(this._interRequestDelayMs);
    }
    const startTime = Date.now();
    try {
      resolve(await requestFn());
    } catch (error) {
      reject(error);
    } finally {
      const duration = Date.now() - startTime;
      this.activeRequests--;
      logger.debug(`[Parallelism] Finished request ${requestId}. Duration: ${duration}ms. Active: ${this.activeRequests}. Queue size: ${this.requestQueue.length}`);
      this._tryProcessQueue();
    }
  }

  /**
   * Enqueues a request and processes it when a slot is available
   * @param {Function} requestFn Function that returns a Promise for the request
   * @returns {Promise} Result of the request
   */
  enqueueRequest(requestFn) {
    return new Promise((resolve, reject) => {
      this.requestQueue.push({ requestFn, resolve, reject });
      // Immediately try to process in case a slot is free.
      this._tryProcessQueue();
    });
  }

  /**
   * Validates and merges model configuration
   * @param config Partial model configuration to merge
   * @returns Complete model configuration
   */
  getMergedConfig(config) {
    return {
      temperature: config?.temperature ?? 0.7,
      maxTokens: config?.maxTokens ?? BaseLLMClient._readIntEnv('MAX_TOKENS', 10000, 1),
      topP: config?.topP ?? 1,
      model: config?.model ?? this.config.model,
      maxParallelRequests: config?.maxParallelRequests ?? this.maxParallelRequests,
    };
  }

  /**
   * Reacts to a rate limit error: halves the concurrency limit (never below
   * 1), raises the pre-request delay to _maxInterRequestDelayMs and
   * (re)schedules the gradual recovery.
   */
  _handleRateLimitHit() {
    const oldLimit = this._currentConcurrencyLimit;
    if (oldLimit > 1) {
      this._currentConcurrencyLimit = Math.max(1, Math.floor(oldLimit / 2));
      logger.warn(`[RATE_LIMIT_ADAPT] Rate limit hit. Reducing concurrency limit from ${oldLimit} to ${this._currentConcurrencyLimit}.`);
    } else {
      logger.warn(`[RATE_LIMIT_ADAPT] Rate limit hit, but concurrency limit already at minimum (1).`);
    }
    this._interRequestDelayMs = this._maxInterRequestDelayMs;
    logger.warn(`[RATE_LIMIT_ADAPT] Setting fixed inter-request delay to ${this._interRequestDelayMs / 1000} seconds.`);
    logger.info(`[RATE_LIMIT_ADAPT] Scheduling attempt to recover concurrency/delay in ${this._recoveryIntervalMs / 1000} seconds.`);
    this._scheduleRecovery();
  }

  /**
   * (Re)arms the recovery timer, replacing any pending one.
   */
  _scheduleRecovery() {
    if (this._increaseConcurrencyTimeout) {
      clearTimeout(this._increaseConcurrencyTimeout);
    }
    const timer = setTimeout(() => {
      // Clear the handle first: the recovery step may arm a new timer.
      this._increaseConcurrencyTimeout = null;
      this._tryRecoverConcurrencyAndDelay();
    }, this._recoveryIntervalMs);
    // A background recovery timer must not keep the process alive on its own.
    timer.unref?.();
    this._increaseConcurrencyTimeout = timer;
  }

  /**
   * Performs one recovery step: raises the concurrency limit by one (up to
   * maxParallelRequests) and resets the pre-request delay to its minimum.
   * Schedules a further step while the limit is still below the maximum and
   * starts any queued requests that now fit.
   */
  _tryRecoverConcurrencyAndDelay() {
    let changed = false;
    if (this._currentConcurrencyLimit < this.maxParallelRequests) {
      this._currentConcurrencyLimit = Math.min(this.maxParallelRequests, this._currentConcurrencyLimit + 1);
      changed = true;
    }
    if (this._interRequestDelayMs !== this._minInterRequestDelayMs) {
      this._interRequestDelayMs = this._minInterRequestDelayMs;
      changed = true;
    }
    if (changed) {
      logger.info(`[RATE_LIMIT_ADAPT] Attempting to recover. New limit: ${this._currentConcurrencyLimit}, Resetting delay to: ${this._interRequestDelayMs}ms.`);
    } else {
      logger.debug('[RATE_LIMIT_ADAPT] Concurrency limit at max and delay at min.');
    }
    // Keep stepping up until the configured maximum is restored.
    if (this._currentConcurrencyLimit < this.maxParallelRequests) {
      this._scheduleRecovery();
    }
    // A raised limit may have opened slots for requests already waiting.
    this._tryProcessQueue();
  }
}