inference-server
Libraries and a server for building AI applications. Adapters to various native bindings allow local inference. Integrate it into your application, or run it as a microservice. A brief usage sketch follows the source below.
import process from 'node:process'
import PQueue from 'p-queue'
import EventEmitter3 from 'eventemitter3'
import { ModelInstance } from '#package/instance.js'
import { ModelConfig, InferenceParams, ModelInstanceRequest, ModelEngine } from '#package/types/index.js'
import { Logger, LogLevels, createSublogger, LogLevel } from '#package/lib/logger.js'
import { mergeAbortSignals } from '#package/lib/util.js'
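// handle returned by ModelPool.requestInstance: the locked instance plus a callback to release it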
export interface ModelInstanceHandle {
instance: ModelInstance
release: () => Promise<void>
}
interface ModelTask {
instance: ModelInstance
request: ModelInstanceRequest
}
type PrepareModelInstanceCallback = (instance: ModelInstance, signal?: AbortSignal) => Promise<void>
interface ModelPoolConfig {
concurrency: number
models: Record<string, ModelConfig>
}
export interface ModelPoolOptions {
concurrency?: number
models: Record<string, ModelConfig>
log?: Logger | LogLevel
}
type ModelPoolEvent = 'ready' | 'spawn' | 'release'
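// Pools model instances: spawns them within each model's min/maxInstances bounds,
// queues incoming requests, reuses idle instances (preferring matching context state)
// and enforces a single-holder gpu lock across all models.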
export class ModelPool extends EventEmitter3<ModelPoolEvent> {
queue: PQueue
config: ModelPoolConfig
instances: Record<string, ModelInstance>
private engines?: Record<string, ModelEngine>
private cleanupInterval?: NodeJS.Timeout
private log: Logger
private requestSequence: number = 0
private pendingRequests: Set<ModelInstanceRequest> = new Set()
private shutdownController: AbortController = new AbortController()
private gpuLock: boolean = false // TODO could derive this from "is there any instance that has gpu=true"
private prepareInstance?: PrepareModelInstanceCallback
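	// prepareInstance is an optional hook invoked before a freshly spawned instance loads its model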
constructor(options: ModelPoolOptions, prepareInstance?: PrepareModelInstanceCallback) {
super()
this.log = createSublogger(options.log)
const models: Record<string, ModelConfig> = {}
for (const id in options.models) {
const modelConfig = options.models[id]
models[id] = {
...modelConfig,
id: modelConfig.id ?? id,
}
}
const config: ModelPoolConfig = {
concurrency: 1,
...options,
models,
}
this.queue = new PQueue({
concurrency: config.concurrency,
})
this.config = config
this.instances = {}
this.prepareInstance = prepareInstance
}
	// start up the pool, creating instances and loading models
async init(engines: Record<string, ModelEngine>) {
const initPromises = []
const modelConfigs = this.config.models
this.engines = engines
// making sure id is set.
for (const modelId in modelConfigs) {
const modelConfig = modelConfigs[modelId]
if (!modelConfig.id) {
modelConfig.id = modelId
}
}
		// prioritize initializing the first model defined that has gpu explicitly set,
		// so the gpu lock can't be acquired first by another model that has gpu=auto/undefined
const firstGpuModel = Object.entries(modelConfigs).find(
([id, config]) => !!config.device?.gpu && config.device?.gpu !== 'auto',
)
if (firstGpuModel) {
const modelConfig = modelConfigs[firstGpuModel[0]]
const spawnPromises = this.ensureModelInstances(modelConfig)
initPromises.push(...spawnPromises)
}
// then handle other models in the order they were defined
for (const modelId in modelConfigs) {
if (firstGpuModel && modelId === firstGpuModel[0]) {
continue
}
const modelConfig = modelConfigs[modelId]
const spawnPromises = this.ensureModelInstances(modelConfig)
initPromises.push(...spawnPromises)
}
if (initPromises.length) {
this.log(LogLevels.debug, 'Spawning initial instances ...')
// resolve when all initial instances are loaded
await Promise.allSettled(initPromises)
const instanceCount = Object.values(this.instances).length
this.log(LogLevels.debug, 'Initial instances ready', {
count: instanceCount,
})
}
this.emit('ready')
this.cleanupInterval = setInterval(() => {
this.disposeOutdatedInstances()
}, 1000 * 60) // every minute
}
	// Check whether minInstances for the given model are spawned; if not, spawn them.
ensureModelInstances(model: ModelConfig) {
const spawnPromises = []
const instanceCount = model.minInstances ?? 0
for (let i = 0; i < instanceCount; i++) {
if (this.canSpawnInstance(model.id)) {
const spawnPromise = this.spawnInstance(model.id)
spawnPromises.push(spawnPromise)
} else {
				this.log(LogLevels.warn, 'Failed to spawn min instances', {
model: model.id,
})
break
}
}
return spawnPromises
}
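	// shut down the pool: stop the queue, abort pending requests and dispose all instances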
async dispose() {
this.log(LogLevels.debug, 'Disposing pool')
clearInterval(this.cleanupInterval)
super.removeAllListeners()
this.queue.pause()
this.queue.clear()
this.shutdownController.abort()
for (const request of this.pendingRequests) {
request.abortController.abort()
}
const disposePromises: Array<Promise<void>> = []
for (const key in this.instances) {
const instance = this.instances[key]
disposePromises.push(this.disposeInstance(instance))
}
await Promise.allSettled(disposePromises)
}
// disposes instances that have been idle for longer than their ttl
private disposeOutdatedInstances() {
const now = new Date().getTime()
for (const key in this.instances) {
const instance = this.instances[key]
const instanceAge = (now - instance.lastUsed) / 1000
const modelInstanceCount = Object.values(this.instances).filter((i) => i.modelId === instance.modelId).length
const minInstanceCount = this.config.models[instance.modelId].minInstances ?? 0
if (modelInstanceCount > minInstanceCount && instanceAge > instance.ttl && instance.status === 'idle') {
this.log(LogLevels.info, 'Auto disposing instance', {
instance: instance.id,
})
this.disposeInstance(instance)
}
}
}
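	// returns a snapshot of the pool for monitoring: busy/pending counts plus per-instance details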
getStatus() {
const processingInstances = Object.values(this.instances).filter((instance) => instance.status === 'busy')
const poolStatusInfo = {
processing: processingInstances.length,
pending: this.pendingRequests.size,
instances: Object.fromEntries(
Object.entries(this.instances).map(([key, instance]) => {
return [
key,
{
model: instance.modelId,
status: instance.status,
engine: instance.config.engine,
device: instance.gpu ? 'gpu' : 'cpu',
contextState: instance.getContextStateIdentity(),
lastUsed: new Date(instance.lastUsed).toISOString(),
},
]
}),
),
}
return poolStatusInfo
}
// checks if another instance can be spawned for given model
canSpawnInstance(modelId: string) {
const modelConfig = this.config.models[modelId]
// if the model is configured with gpu=true, interpret that as "it MUST run on gpu"
// and prevent spawning more instances if the gpu is already locked.
const requiresGpu = !!modelConfig.device?.gpu
if (requiresGpu && this.gpuLock && modelConfig.device?.gpu !== 'auto') {
			this.log(LogLevels.verbose, 'Cannot spawn new instance: model requires gpu, but the gpu is already in use', {
model: modelId,
})
return false
}
// see if we're within maxInstances limit
const maxInstances = modelConfig.maxInstances ?? 1
const currentInstances = Object.values(this.instances).filter((instance) => instance.modelId === modelId)
if (currentInstances.length >= maxInstances) {
this.log(LogLevels.verbose, 'Cannot spawn new instance: maxInstances reached', {
model: modelId,
				current: currentInstances.length,
max: maxInstances,
})
return false
}
return true
}
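	// disposes the given instance and releases the gpu lock if it was holding it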
private async disposeInstance(instance: ModelInstance) {
this.log(LogLevels.debug, 'Disposing instance', {
instance: instance.id,
})
await instance.dispose()
if (instance.gpu) {
this.gpuLock = false
}
delete this.instances[instance.id]
}
// spawns a new instance for the given model, without checking whether it's allowed
private async spawnInstance(modelId: string, options: { signal?: AbortSignal; emit?: boolean } = {}) {
if (!this.engines) {
throw new Error(
'No engines available. Make sure the pool is initialized and inferenceServer.start() or ModelPool.init() were called.',
)
}
const model = this.config.models[modelId]
const engine = this.engines[model.engine]
if (!engine) {
throw new Error(`Engine not found: ${model.engine}`)
}
const autoGpuEnabled = !!engine.autoGpu
		// if the model is configured with gpu=auto (or unset), we can use the gpu if it's not locked
const autoGpu = model.device?.gpu === undefined || model.device?.gpu === 'auto'
let useGpu = autoGpu ? autoGpuEnabled && !this.gpuLock : false
		// gpu=auto was already handled above; only force the gpu when it is explicitly requested
		if (model.device?.gpu && model.device.gpu !== 'auto') {
			useGpu = true
		}
const instance = new ModelInstance(engine, {
...model,
gpu: useGpu,
log: this.log,
})
this.instances[instance.id] = instance
if (useGpu) {
this.gpuLock = true
}
const signals = [this.shutdownController.signal]
if (options.signal) {
signals.push(options.signal)
}
const abortSignal = mergeAbortSignals(signals)
if (this.prepareInstance) {
this.log(LogLevels.debug, 'Preparing instance', {
instance: instance.id,
})
try {
await this.prepareInstance(instance, abortSignal)
instance.status = 'idle'
} catch (error) {
this.log(LogLevels.error, 'Error preparing instance', {
model: modelId,
instance: instance.id,
error,
})
instance.status = 'error'
return instance
}
}
await instance.load(abortSignal)
if (options.emit !== false) {
this.emit('spawn', instance)
}
return instance
}
// wait to acquire a gpu instance for the given request
private acquireGpuInstance(request: ModelInstanceRequest, signal?: AbortSignal): Promise<ModelInstance> {
return new Promise(async (resolve, reject) => {
			// if we have an idle gpu instance and the model matches we can lock and return immediately
			const gpuInstance = Object.values(this.instances).find((instance) => instance.gpu === true)
			if (gpuInstance?.status === 'idle') {
if (gpuInstance.modelId === request.model) {
gpuInstance.lock(request)
resolve(gpuInstance)
return
} else {
await this.disposeInstance(gpuInstance)
const newInstance = await this.spawnInstance(request.model, {
emit: false,
})
newInstance.lock(request)
resolve(newInstance)
return
}
}
			// otherwise attach a listener and wait until a gpu slot becomes available
const listener = async (instance: ModelInstance) => {
if (instance.gpu === true && instance.status === 'idle') {
if (instance.matchesRequirements(request)) {
						// model matches what's needed, lock and resolve
this.off('release', listener)
instance.lock(request)
resolve(instance)
} else {
						// model doesn't match, dispose it and spawn a new instance
this.off('release', listener)
await this.disposeInstance(instance)
const newInstance = await this.spawnInstance(request.model, {
emit: false,
})
newInstance.lock(request)
resolve(newInstance)
}
}
}
this.on('release', listener)
if (signal) {
signal.addEventListener('abort', () => {
this.off('release', listener)
reject(signal.reason)
})
}
})
}
// wait to acquire an idle instance for the given request
private acquireIdleInstance(request: ModelInstanceRequest, signal?: AbortSignal): Promise<ModelInstance> {
return new Promise((resolve, reject) => {
const listener = (instance: ModelInstance) => {
if (instance.matchesRequirements(request) && instance.status === 'idle') {
this.off('release', listener)
this.off('spawn', listener)
try {
instance.lock(request)
resolve(instance)
} catch (error: any) {
this.log(LogLevels.error, 'Error acquiring idle instance', {
error,
})
reject(error)
}
}
}
this.on('spawn', listener)
this.on('release', listener)
if (signal) {
signal.addEventListener('abort', () => {
this.off('release', listener)
this.off('spawn', listener)
reject(signal.reason)
})
}
})
}
// acquire an instance for the given request
private async acquireInstance(request: ModelInstanceRequest, signal?: AbortSignal) {
if ('messages' in request || 'prompt' in request) {
// for text and chat completions first search for an instance that has the context ready
for (const key in this.instances) {
const instance = this.instances[key]
if (
instance.matchesRequirements(request) &&
instance.status === 'idle' &&
instance.matchesContextState(request)
) {
this.log(LogLevels.debug, 'Cache hit - reusing cached instance', {
instance: instance.id,
sequence: request.sequence,
})
instance.lock(request)
return instance
}
}
this.log(LogLevels.debug, 'Cache miss - continue acquiring model instance', { sequence: request.sequence })
}
// prefer an instance of the model that has no context state.
for (const key in this.instances) {
const instance = this.instances[key]
if (instance.matchesRequirements(request) && instance.status === 'idle' && !instance.hasContextState()) {
this.log(LogLevels.debug, 'Reusing idle instance without context state', {
instance: instance.id,
sequence: request.sequence,
})
instance.lock(request)
return instance
}
}
		// still haven't found one; see if we're allowed to spawn a new instance
if (this.canSpawnInstance(request.model)) {
const instance = await this.spawnInstance(request.model, {
emit: false,
})
			// reset the context if the request doesn't match the instance's preloaded context state
const hasInitialContextState = instance.config.initialMessages?.length || instance.config.prefix
if (hasInitialContextState && !instance.matchesContextState(request)) {
instance.resetContext()
}
this.log(LogLevels.debug, 'Spawned instance acquired', {
instance: instance.id,
sequence: request.sequence,
})
instance.lock(request)
return instance
}
// if all instances have cached state, prefer the one that was used the longest time ago
const availableInstances = Object.values(this.instances).filter(
(instance) => instance.matchesRequirements(request) && instance.status === 'idle',
)
if (availableInstances.length > 0) {
const lruInstance = availableInstances.reduce((prev, current) =>
prev.lastUsed < current.lastUsed ? prev : current,
)
this.log(LogLevels.debug, 'Reusing least recently used instance', {
instance: lruInstance.id,
sequence: request.sequence,
})
lruInstance.lock(request)
lruInstance.resetContext() // make sure we reset its cache.
return lruInstance
}
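		// the model needs the gpu but another model is currently holding it:
		// wait for the gpu instance to be released, then swap in the requested model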
const requiresGpu = this.config.models[request.model].device?.gpu === true
if (requiresGpu && this.gpuLock) {
const gpuInstance = Object.values(this.instances).find((instance) => instance.gpu === true)!
if (gpuInstance.modelId !== request.model) {
this.log(LogLevels.debug, 'GPU already in use, waiting ...', {
sequence: request.sequence,
})
const instance = await this.acquireGpuInstance(request, signal)
this.log(LogLevels.debug, 'GPU instance acquired', {
instance: instance.id,
sequence: request.sequence,
})
if (signal?.aborted) {
instance.unlock()
throw signal.reason
} else {
return instance
}
}
}
		// before starting to wait, make sure we're not stuck waiting forever on an errored instance
		// currently instances only enter the error state if prepareInstance throws
const errorInstance = Object.values(this.instances).find(
(instance) => instance.modelId === request.model && instance.status === 'error',
)
if (errorInstance) {
throw new Error('Instance is in error state')
}
// wait until an instance of our model is released or spawned
this.log(LogLevels.debug, 'Awaiting idle instance', {
model: request.model,
sequence: request.sequence,
})
const instance = await this.acquireIdleInstance(request, signal)
this.log(LogLevels.debug, 'Idle instance acquired', {
instance: instance.id,
sequence: request.sequence,
})
if (signal?.aborted) {
instance.unlock()
throw signal.reason
} else {
return instance
}
}
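	// next sequence number used to correlate a request across log lines; wraps after 999999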
private createRequestSequence() {
if (this.requestSequence > 999999) {
this.requestSequence = 0
}
return ++this.requestSequence
}
// requests a model instance from the pool
async requestInstance(params: Partial<InferenceParams>, signal?: AbortSignal): Promise<ModelInstanceHandle> {
if (this.shutdownController.signal.aborted) {
throw new Error('Pool is disposed')
}
const requestSequence = this.createRequestSequence()
const request = {
...params as InferenceParams,
sequence: requestSequence,
abortController: new AbortController(),
}
if (!this.config.models[request.model]) {
this.log(LogLevels.error, `Model not found: ${request.model}`)
throw new Error(`Model not found: ${request.model}`)
}
this.log(LogLevels.info, 'Incoming request', {
model: request.model,
sequence: request.sequence,
})
this.pendingRequests.add(request)
const abortSignal = mergeAbortSignals([request.abortController.signal, signal])
abortSignal.addEventListener('abort', () => {
this.log(LogLevels.info, 'Request aborted', {
model: request.model,
sequence: request.sequence,
})
this.pendingRequests.delete(request)
})
const instance = await this.acquireInstance(request, abortSignal)
		// once the instance is acquired & locked, we can hand it to the caller.
		// the queue task stays pending until release is called, keeping the concurrency slot occupied
let resolveQueueTask: (value: ModelTask) => void = () => {}
this.queue
.add((): Promise<ModelTask> => {
this.pendingRequests.delete(request)
return new Promise((resolve, reject) => {
resolveQueueTask = resolve
})
})
.then((task) => {
// if there are more requests waiting, prioritize handling them before spawning new instances
// deferred to avoid AbortError when the pool is disposed right after the operation
process.nextTick(() => {
if (
!this.pendingRequests.size &&
this.canSpawnInstance(request.model) &&
!this.shutdownController.signal.aborted
) {
this.spawnInstance(request.model)
}
})
if (task?.instance) {
this.emit('release', instance)
}
})
// TODO what if user never calls release? automatically resolve or reject after a timeout?
const releaseInstance = () => {
return new Promise<void>((resolve, reject) => {
process.nextTick(() => {
resolveQueueTask({ instance, request })
this.log(LogLevels.info, 'Task completed, releasing', {
instance: instance.id,
sequence: request.sequence,
})
if (instance.config.ttl === 0) {
this.disposeInstance(instance)
} else {
instance.unlock()
}
resolve()
})
})
}
return {
instance,
release: releaseInstance,
}
}
}
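// ------------------------------------------------------------------
// Usage sketch (illustrative, not part of the file above): one way a
// caller might drive the pool. `chatEngine` stands in for a real
// ModelEngine implementation, and the model config lists only the
// fields this file reads (engine, device, minInstances, maxInstances);
// a real ModelConfig will need whatever else your engine requires.
declare const chatEngine: ModelEngine

async function exampleUsage() {
	const pool = new ModelPool({
		concurrency: 2,
		models: {
			'chat-model': {
				engine: 'chat', // key into the engines record passed to init()
				device: { gpu: 'auto' },
				minInstances: 1,
				maxInstances: 2,
			},
		},
	})
	await pool.init({ chat: chatEngine })
	const { instance, release } = await pool.requestInstance({ model: 'chat-model' })
	try {
		// run inference against `instance` here
	} finally {
		// always release; the queue slot stays occupied until release() resolves
		await release()
	}
	await pool.dispose()
}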