inference-server
Libraries and a server for building AI applications, with adapters to various native bindings for local inference. Integrate it with your application, or run it as a microservice.
JavaScript
import process from 'node:process';
import PQueue from 'p-queue';
import EventEmitter3 from 'eventemitter3';
import { ModelInstance } from './instance.js';
import { LogLevels, createSublogger } from './lib/logger.js';
import { mergeAbortSignals } from './lib/util.js';
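/**
 * ModelPool manages a set of ModelInstance objects: it spawns instances up to each
 * model's maxInstances, keeps minInstances alive, disposes idle instances after
 * their ttl, and serializes access to a single gpu via gpuLock.
 *
 * A rough usage sketch, based only on the API in this file; the model name,
 * engine name, and `engines` map are illustrative placeholders:
 *
 *   const pool = new ModelPool({
 *       concurrency: 2,
 *       models: {
 *           'my-model': {
 *               engine: 'my-engine', // key into the engines map passed to init()
 *               device: { gpu: 'auto' }, // true = must run on gpu; 'auto'/undefined = use gpu if free
 *               minInstances: 1,
 *               maxInstances: 2,
 *               ttl: 300, // idle seconds before auto-disposal; 0 = dispose after each use
 *           },
 *       },
 *   });
 *   await pool.init(engines); // engines: { 'my-engine': engineImplementation }
 *   const { instance, release } = await pool.requestInstance({ model: 'my-model' });
 *   // ... run inference on the instance ...
 *   await release();
 */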
export class ModelPool extends EventEmitter3 {
queue;
config;
instances;
engines;
cleanupInterval;
log;
requestSequence = 0;
pendingRequests = new Set();
shutdownController = new AbortController();
gpuLock = false; // TODO could derive this from "is there any instance that has gpu=true"
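// optional async hook called with (instance, abortSignal) before an instance loads;
// if it throws, the instance is marked status='error' (see spawnInstance)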
prepareInstance;
constructor(options, prepareInstance) {
super();
this.log = createSublogger(options.log);
const models = {};
for (const id in options.models) {
const modelConfig = options.models[id];
models[id] = {
...modelConfig,
id: modelConfig.id ?? id,
};
}
const config = {
concurrency: 1,
...options,
models,
};
this.queue = new PQueue({
concurrency: config.concurrency,
});
this.config = config;
this.instances = {};
this.prepareInstance = prepareInstance;
}
// start up pool, creating instances and loading models
async init(engines) {
const initPromises = [];
const modelConfigs = this.config.models;
this.engines = engines;
// making sure id is set.
for (const modelId in modelConfigs) {
const modelConfig = modelConfigs[modelId];
if (!modelConfig.id) {
modelConfig.id = modelId;
}
}
// prioritize initializing the first model defined that has gpu explicitly set,
// so the lock can't be acquired first by another model that has gpu=auto/undefined
const firstGpuModel = Object.entries(modelConfigs).find(([id, config]) => !!config.device?.gpu && config.device?.gpu !== 'auto');
if (firstGpuModel) {
const modelConfig = modelConfigs[firstGpuModel[0]];
const spawnPromises = this.ensureModelInstances(modelConfig);
initPromises.push(...spawnPromises);
}
// then handle other models in the order they were defined
for (const modelId in modelConfigs) {
if (firstGpuModel && modelId === firstGpuModel[0]) {
continue;
}
const modelConfig = modelConfigs[modelId];
const spawnPromises = this.ensureModelInstances(modelConfig);
initPromises.push(...spawnPromises);
}
if (initPromises.length) {
this.log(LogLevels.debug, 'Spawning initial instances ...');
// resolve when all initial instances are loaded
await Promise.allSettled(initPromises);
const instanceCount = Object.values(this.instances).length;
this.log(LogLevels.debug, 'Initial instances ready', {
count: instanceCount,
});
}
this.emit('ready');
this.cleanupInterval = setInterval(() => {
this.disposeOutdatedInstances();
}, 1000 * 60); // every minute
}
// Check if the minInstances for a model are spawned. If not, spawn them.
ensureModelInstances(model) {
const spawnPromises = [];
const instanceCount = model.minInstances ?? 0;
for (let i = 0; i < instanceCount; i++) {
if (this.canSpawnInstance(model.id)) {
const spawnPromise = this.spawnInstance(model.id);
spawnPromises.push(spawnPromise);
}
else {
this.log(LogLevels.warn, 'Failed to spawn min instances for', {
model: model.id,
});
break;
}
}
return spawnPromises;
}
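// shut down the pool: stop the queue, abort pending requests and dispose all instances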
async dispose() {
this.log(LogLevels.debug, 'Disposing pool');
clearInterval(this.cleanupInterval);
super.removeAllListeners();
this.queue.pause();
this.queue.clear();
this.shutdownController.abort();
for (const request of this.pendingRequests) {
request.abortController.abort();
}
const disposePromises = [];
for (const key in this.instances) {
const instance = this.instances[key];
disposePromises.push(this.disposeInstance(instance));
}
await Promise.allSettled(disposePromises);
}
// disposes instances that have been idle for longer than their ttl
disposeOutdatedInstances() {
const now = new Date().getTime();
for (const key in this.instances) {
const instance = this.instances[key];
const instanceAge = (now - instance.lastUsed) / 1000; // idle time in seconds; ttl is compared in seconds
const modelInstanceCount = Object.values(this.instances).filter((i) => i.modelId === instance.modelId).length;
const minInstanceCount = this.config.models[instance.modelId].minInstances ?? 0;
if (modelInstanceCount > minInstanceCount && instanceAge > instance.ttl && instance.status === 'idle') {
this.log(LogLevels.info, 'Auto disposing instance', {
instance: instance.id,
});
this.disposeInstance(instance);
}
}
}
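// returns a snapshot of the pool: busy instance count, pending request count,
// and per-instance model/status/engine/device/context/lastUsed info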
getStatus() {
const processingInstances = Object.values(this.instances).filter((instance) => instance.status === 'busy');
const poolStatusInfo = {
processing: processingInstances.length,
pending: this.pendingRequests.size,
instances: Object.fromEntries(Object.entries(this.instances).map(([key, instance]) => {
return [
key,
{
model: instance.modelId,
status: instance.status,
engine: instance.config.engine,
device: instance.gpu ? 'gpu' : 'cpu',
contextState: instance.getContextStateIdentity(),
lastUsed: new Date(instance.lastUsed).toISOString(),
},
];
})),
};
return poolStatusInfo;
}
// checks if another instance can be spawned for given model
canSpawnInstance(modelId) {
const modelConfig = this.config.models[modelId];
// if the model is configured with gpu=true, interpret that as "it MUST run on gpu"
// and prevent spawning more instances if the gpu is already locked.
const requiresGpu = !!modelConfig.device?.gpu;
if (requiresGpu && this.gpuLock && modelConfig.device?.gpu !== 'auto') {
this.log(LogLevels.verbose, 'Cannot spawn new instance: model requires gpu, but the gpu is already in use', {
model: modelId,
});
return false;
}
// see if we're within maxInstances limit
const maxInstances = modelConfig.maxInstances ?? 1;
const currentInstances = Object.values(this.instances).filter((instance) => instance.modelId === modelId);
if (currentInstances.length >= maxInstances) {
this.log(LogLevels.verbose, 'Cannot spawn new instance: maxInstances reached', {
model: modelId,
current: currentInstances.length,
max: maxInstances,
});
return false;
}
return true;
}
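// disposes a single instance and releases the gpu lock if the instance held it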
async disposeInstance(instance) {
this.log(LogLevels.debug, 'Disposing instance', {
instance: instance.id,
});
await instance.dispose();
if (instance.gpu) {
this.gpuLock = false;
}
delete this.instances[instance.id];
}
// spawns a new instance for the given model, without checking whether it's allowed
async spawnInstance(modelId, options = {}) {
if (!this.engines) {
throw new Error('No engines available. Make sure the pool is initialized and inferenceServer.start() or ModelPool.init() were called.');
}
const model = this.config.models[modelId];
const engine = this.engines[model.engine];
if (!engine) {
throw new Error(`Engine not found: ${model.engine}`);
}
const autoGpuEnabled = !!engine.autoGpu;
// if the model is configured with gpu=auto (or unset), we can use the gpu if it's not locked
const autoGpu = model.device?.gpu === undefined || model.device?.gpu === 'auto';
let useGpu = autoGpu ? autoGpuEnabled && !this.gpuLock : false;
// an explicit, non-auto gpu setting always claims the gpu
if (!!model.device?.gpu && model.device?.gpu !== 'auto') {
useGpu = true;
}
const instance = new ModelInstance(engine, {
...model,
gpu: useGpu,
log: this.log,
});
this.instances[instance.id] = instance;
if (useGpu) {
this.gpuLock = true;
}
const signals = [this.shutdownController.signal];
if (options.signal) {
signals.push(options.signal);
}
const abortSignal = mergeAbortSignals(signals);
if (this.prepareInstance) {
this.log(LogLevels.debug, 'Preparing instance', {
instance: instance.id,
});
try {
await this.prepareInstance(instance, abortSignal);
instance.status = 'idle';
}
catch (error) {
this.log(LogLevels.error, 'Error preparing instance', {
model: modelId,
instance: instance.id,
error,
});
instance.status = 'error';
return instance;
}
}
await instance.load(abortSignal);
if (options.emit !== false) {
this.emit('spawn', instance);
}
return instance;
}
// wait to acquire a gpu instance for the given request
acquireGpuInstance(request, signal) {
return new Promise(async (resolve, reject) => {
// if we have an idle gpu instance and the model matches we can lock and return immediately
const gpuInstance = Object.values(this.instances).find((instance) => instance.gpu === true);
if (gpuInstance?.status === 'idle') {
if (gpuInstance.modelId === request.model) {
gpuInstance.lock(request);
resolve(gpuInstance);
return;
}
else {
// idle gpu instance is loaded with a different model: replace it
try {
await this.disposeInstance(gpuInstance);
const newInstance = await this.spawnInstance(request.model, {
emit: false,
});
newInstance.lock(request);
resolve(newInstance);
}
catch (error) {
reject(error);
}
return;
}
}
// otherwise attach the listener and wait until gpu slot becomes available
const listener = async (instance) => {
if (instance.gpu === true && instance.status === 'idle') {
if (instance.matchesRequirements(request)) {
// model matches what's needed, lock and resolve
this.off('release', listener);
instance.lock(request);
resolve(instance);
}
else {
// model doesn't match, dispose and spawn a new instance
this.off('release', listener);
try {
await this.disposeInstance(instance);
const newInstance = await this.spawnInstance(request.model, {
emit: false,
});
newInstance.lock(request);
resolve(newInstance);
}
catch (error) {
reject(error);
}
}
}
};
this.on('release', listener);
if (signal) {
signal.addEventListener('abort', () => {
this.off('release', listener);
reject(signal.reason);
});
}
});
}
// wait to acquire an idle instance for the given request
acquireIdleInstance(request, signal) {
return new Promise((resolve, reject) => {
const listener = (instance) => {
if (instance.matchesRequirements(request) && instance.status === 'idle') {
this.off('release', listener);
this.off('spawn', listener);
try {
instance.lock(request);
resolve(instance);
}
catch (error) {
this.log(LogLevels.error, 'Error acquiring idle instance', {
error,
});
reject(error);
}
}
};
this.on('spawn', listener);
this.on('release', listener);
if (signal) {
signal.addEventListener('abort', () => {
this.off('release', listener);
this.off('spawn', listener);
reject(signal.reason);
});
}
});
}
// acquire an instance for the given request
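// Acquisition strategy, in order: (1) idle instance whose context state already
// matches the request, (2) idle instance without any context state, (3) spawn a
// new instance if allowed, (4) least recently used idle instance (context is
// reset), (5) wait for the gpu or for a matching instance to be released/spawned.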
async acquireInstance(request, signal) {
if ('messages' in request || 'prompt' in request) {
// for text and chat completions first search for an instance that has the context ready
for (const key in this.instances) {
const instance = this.instances[key];
if (instance.matchesRequirements(request) &&
instance.status === 'idle' &&
instance.matchesContextState(request)) {
this.log(LogLevels.debug, 'Cache hit - reusing cached instance', {
instance: instance.id,
sequence: request.sequence,
});
instance.lock(request);
return instance;
}
}
this.log(LogLevels.debug, 'Cache miss - continue acquiring model instance', { sequence: request.sequence });
}
// prefer an instance of the model that has no context state.
for (const key in this.instances) {
const instance = this.instances[key];
if (instance.matchesRequirements(request) && instance.status === 'idle' && !instance.hasContextState()) {
this.log(LogLevels.debug, 'Reusing idle instance without context state', {
instance: instance.id,
sequence: request.sequence,
});
instance.lock(request);
return instance;
}
}
// still haven't found one, see if we're allowed to spawn a new instance
if (this.canSpawnInstance(request.model)) {
const instance = await this.spawnInstance(request.model, {
emit: false,
});
// reset the context if the request doesn't match the instance's preloaded context state
const hasInitialContextState = instance.config.initialMessages?.length || instance.config.prefix;
if (hasInitialContextState && !instance.matchesContextState(request)) {
instance.resetContext();
}
this.log(LogLevels.debug, 'Spawned instance acquired', {
instance: instance.id,
sequence: request.sequence,
});
instance.lock(request);
return instance;
}
// if all instances have cached state, prefer the one that was used the longest time ago
const availableInstances = Object.values(this.instances).filter((instance) => instance.matchesRequirements(request) && instance.status === 'idle');
if (availableInstances.length > 0) {
const lruInstance = availableInstances.reduce((prev, current) => prev.lastUsed < current.lastUsed ? prev : current);
this.log(LogLevels.debug, 'Reusing least recently used instance', {
instance: lruInstance.id,
sequence: request.sequence,
});
lruInstance.lock(request);
lruInstance.resetContext(); // make sure we reset its cache.
return lruInstance;
}
const requiresGpu = this.config.models[request.model].device?.gpu === true;
if (requiresGpu && this.gpuLock) {
const gpuInstance = Object.values(this.instances).find((instance) => instance.gpu === true);
if (gpuInstance && gpuInstance.modelId !== request.model) {
this.log(LogLevels.debug, 'GPU already in use, waiting ...', {
sequence: request.sequence,
});
const instance = await this.acquireGpuInstance(request, signal);
this.log(LogLevels.debug, 'GPU instance acquired', {
instance: instance.id,
sequence: request.sequence,
});
if (signal?.aborted) {
instance.unlock();
throw signal.reason;
}
else {
return instance;
}
}
}
// before starting to wait, make sure we're not stuck waiting forever on an errored instance
// currently instances only enter the error state if prepareInstance throws an error
const errorInstance = Object.values(this.instances).find((instance) => instance.modelId === request.model && instance.status === 'error');
if (errorInstance) {
throw new Error(`Instance of model ${request.model} is in error state`);
}
// wait until an instance of our model is released or spawned
this.log(LogLevels.debug, 'Awaiting idle instance', {
model: request.model,
sequence: request.sequence,
});
const instance = await this.acquireIdleInstance(request, signal);
this.log(LogLevels.debug, 'Idle instance acquired', {
instance: instance.id,
sequence: request.sequence,
});
if (signal?.aborted) {
instance.unlock();
throw signal.reason;
}
else {
return instance;
}
}
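// returns an incrementing id used to correlate a request's log lines; wraps back to 1 after 999999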
createRequestSequence() {
if (this.requestSequence > 999999) {
this.requestSequence = 0;
}
return ++this.requestSequence;
}
// requests a model instance from the pool
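// resolves with { instance, release }; the caller must invoke release() when done,
// otherwise the queue slot is never freed (see the TODO below)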
async requestInstance(params, signal) {
if (this.shutdownController.signal.aborted) {
throw new Error('Pool is disposed');
}
const requestSequence = this.createRequestSequence();
const request = {
...params,
sequence: requestSequence,
abortController: new AbortController(),
};
if (!this.config.models[request.model]) {
this.log(LogLevels.error, `Model not found: ${request.model}`);
throw new Error(`Model not found: ${request.model}`);
}
this.log(LogLevels.info, 'Incoming request', {
model: request.model,
sequence: request.sequence,
});
this.pendingRequests.add(request);
const abortSignal = mergeAbortSignals([request.abortController.signal, signal]);
abortSignal.addEventListener('abort', () => {
this.log(LogLevels.info, 'Request aborted', {
model: request.model,
sequence: request.sequence,
});
this.pendingRequests.delete(request);
});
const instance = await this.acquireInstance(request, abortSignal);
// once instance is acquired & locked, we can pass it on to the caller
// the queue task promise will be forwarded as releaseInstance
let resolveQueueTask = () => { };
this.queue
.add(() => {
this.pendingRequests.delete(request);
return new Promise((resolve, reject) => {
resolveQueueTask = resolve;
});
})
.then((task) => {
// if there are more requests waiting, prioritize handling them before spawning new instances
// deferred to avoid AbortError when the pool is disposed right after the operation
process.nextTick(() => {
if (!this.pendingRequests.size &&
this.canSpawnInstance(request.model) &&
!this.shutdownController.signal.aborted) {
this.spawnInstance(request.model);
}
});
if (task?.instance) {
this.emit('release', instance);
}
});
// TODO what if user never calls release? automatically resolve or reject after a timeout?
const releaseInstance = () => {
return new Promise((resolve, reject) => {
process.nextTick(() => {
resolveQueueTask({ instance, request });
this.log(LogLevels.info, 'Task completed, releasing', {
instance: instance.id,
sequence: request.sequence,
});
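// a ttl of 0 means single-use: dispose immediately instead of returning the instance to the pool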
if (instance.config.ttl === 0) {
this.disposeInstance(instance);
}
else {
instance.unlock();
}
resolve();
});
});
};
return {
instance,
release: releaseInstance,
};
}
}
//# sourceMappingURL=pool.js.map