inference-server
Libraries and a server for building AI applications. Adapters to various native bindings allow local inference. Integrate it into your application, or run it as a microservice. A brief usage sketch follows the source below.
import process from 'node:process'
import PQueue from 'p-queue'
import EventEmitter3 from 'eventemitter3'
import { ModelInstance } from '#package/instance.js'
import { ModelConfig, InferenceParams, ModelInstanceRequest, ModelEngine } from '#package/types/index.js'
import { Logger, LogLevels, createSublogger, LogLevel } from '#package/lib/logger.js'
import { mergeAbortSignals } from '#package/lib/util.js'
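// handle returned by ModelPool.requestInstance: the locked instance plus a callback to release it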
export interface ModelInstanceHandle {
instance: ModelInstance
release: () => Promise<void>
}
interface ModelTask {
instance: ModelInstance
request: ModelInstanceRequest
}
type PrepareModelInstanceCallback = (instance: ModelInstance, signal?: AbortSignal) => Promise<void>
interface ModelPoolConfig {
concurrency: number
models: Record<string, ModelConfig>
}
export interface ModelPoolOptions {
concurrency?: number
models: Record<string, ModelConfig>
log?: Logger | LogLevel
}
type ModelPoolEvent = 'ready' | 'spawn' | 'release'
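// Pools model instances: spawns them within each model's min/maxInstances bounds,
// queues incoming requests, reuses idle instances (preferring matching context state)
// and enforces a single-holder gpu lock across all models.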
export class ModelPool extends EventEmitter3<ModelPoolEvent> {
queue: PQueue
config: ModelPoolConfig
instances: Record<string, ModelInstance>
private engines?: Record<string, ModelEngine>
private cleanupInterval?: NodeJS.Timeout
private log: Logger
private requestSequence: number = 0
private pendingRequests: Set<ModelInstanceRequest> = new Set()
private shutdownController: AbortController = new AbortController()
private gpuLock: boolean = false // TODO could derive this from "is there any instance that has gpu=true"
private prepareInstance?: PrepareModelInstanceCallback
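	// prepareInstance is an optional hook invoked before a freshly spawned instance loads its model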
constructor(options: ModelPoolOptions, prepareInstance?: PrepareModelInstanceCallback) {
super()
this.log = createSublogger(options.log)
const models: Record<string, ModelConfig> = {}
for (const id in options.models) {
const modelConfig = options.models[id]
models[id] = {
...modelConfig,
id: modelConfig.id ?? id,
}
}
const config: ModelPoolConfig = {
concurrency: 1,
...options,
models,
}
this.queue = new PQueue({
concurrency: config.concurrency,
})
this.config = config
this.instances = {}
this.prepareInstance = prepareInstance
}
	// start up the pool, creating instances and loading models
async init(engines: Record<string, ModelEngine>) {
const initPromises = []
const modelConfigs = this.config.models
this.engines = engines
// making sure id is set.
for (const modelId in modelConfigs) {
const modelConfig = modelConfigs[modelId]
if (!modelConfig.id) {
modelConfig.id = modelId
}
}
		// prioritize initializing the first model defined that has gpu explicitly set,
		// so the gpu lock can't be acquired first by another model that has gpu=auto/undefined
const firstGpuModel = Object.entries(modelConfigs).find(
([id, config]) => !!config.device?.gpu && config.device?.gpu !== 'auto',
)
if (firstGpuModel) {
const modelConfig = modelConfigs[firstGpuModel[0]]
const spawnPromises = this.ensureModelInstances(modelConfig)
initPromises.push(...spawnPromises)
}
// then handle other models in the order they were defined
for (const modelId in modelConfigs) {
if (firstGpuModel && modelId === firstGpuModel[0]) {
continue
}
const modelConfig = modelConfigs[modelId]
const spawnPromises = this.ensureModelInstances(modelConfig)
initPromises.push(...spawnPromises)
}
if (initPromises.length) {
this.log(LogLevels.debug, 'Spawning initial instances ...')
// resolve when all initial instances are loaded
await Promise.allSettled(initPromises)
const instanceCount = Object.values(this.instances).length
this.log(LogLevels.debug, 'Initial instances ready', {
count: instanceCount,
})
}
this.emit('ready')
this.cleanupInterval = setInterval(() => {
this.disposeOutdatedInstances()
}, 1000 * 60) // every minute
}
	// Check whether minInstances for the given model are spawned; if not, spawn them.
ensureModelInstances(model: ModelConfig) {
const spawnPromises = []
const instanceCount = model.minInstances ?? 0
for (let i = 0; i < instanceCount; i++) {
if (this.canSpawnInstance(model.id)) {
const spawnPromise = this.spawnInstance(model.id)
spawnPromises.push(spawnPromise)
} else {
				this.log(LogLevels.warn, 'Failed to spawn min instances', {
model: model.id,
})
break
}
}
return spawnPromises
}
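	// shut down the pool: stop the queue, abort pending requests and dispose all instances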
async dispose() {
this.log(LogLevels.debug, 'Disposing pool')
clearInterval(this.cleanupInterval)
super.removeAllListeners()
this.queue.pause()
this.queue.clear()
this.shutdownController.abort()
for (const request of this.pendingRequests) {
request.abortController.abort()
}
const disposePromises: Array<Promise<void>> = []
for (const key in this.instances) {
const instance = this.instances[key]
disposePromises.push(this.disposeInstance(instance))
}
await Promise.allSettled(disposePromises)
}
// disposes instances that have been idle for longer than their ttl
private disposeOutdatedInstances() {
const now = new Date().getTime()
for (const key in this.instances) {
const instance = this.instances[key]
const instanceAge = (now - instance.lastUsed) / 1000
const modelInstanceCount = Object.values(this.instances).filter((i) => i.modelId === instance.modelId).length
const minInstanceCount = this.config.models[instance.modelId].minInstances ?? 0
if (modelInstanceCount > minInstanceCount && instanceAge > instance.ttl && instance.status === 'idle') {
this.log(LogLevels.info, 'Auto disposing instance', {
instance: instance.id,
})
this.disposeInstance(instance)
}
}
}
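	// returns a snapshot of the pool for monitoring: busy/pending counts plus per-instance details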
getStatus() {
const processingInstances = Object.values(this.instances).filter((instance) => instance.status === 'busy')
const poolStatusInfo = {
processing: processingInstances.length,
pending: this.pendingRequests.size,
instances: Object.fromEntries(
Object.entries(this.instances).map(([key, instance]) => {
return [
key,
{
model: instance.modelId,
status: instance.status,
engine: instance.config.engine,
device: instance.gpu ? 'gpu' : 'cpu',
contextState: instance.getContextStateIdentity(),
lastUsed: new Date(instance.lastUsed).toISOString(),
},
]
}),
),
}
return poolStatusInfo
}
// checks if another instance can be spawned for given model
canSpawnInstance(modelId: string) {
const modelConfig = this.config.models[modelId]
// if the model is configured with gpu=true, interpret that as "it MUST run on gpu"
// and prevent spawning more instances if the gpu is already locked.
const requiresGpu = !!modelConfig.device?.gpu
if (requiresGpu && this.gpuLock && modelConfig.device?.gpu !== 'auto') {
			this.log(LogLevels.verbose, 'Cannot spawn new instance: model requires gpu, but the gpu is already in use', {
model: modelId,
})
return false
}
// see if we're within maxInstances limit
const maxInstances = modelConfig.maxInstances ?? 1
const currentInstances = Object.values(this.instances).filter((instance) => instance.modelId === modelId)
if (currentInstances.length >= maxInstances) {
this.log(LogLevels.verbose, 'Cannot spawn new instance: maxInstances reached', {
model: modelId,
				current: currentInstances.length,
max: maxInstances,
})
return false
}
return true
}
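	// disposes the given instance and releases the gpu lock if it was holding it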
private async disposeInstance(instance: ModelInstance) {
this.log(LogLevels.debug, 'Disposing instance', {
instance: instance.id,
})
await instance.dispose()
if (instance.gpu) {
this.gpuLock = false
}
delete this.instances[instance.id]
}
// spawns a new instance for the given model, without checking whether it's allowed
private async spawnInstance(modelId: string, options: { signal?: AbortSignal; emit?: boolean } = {}) {
if (!this.engines) {
throw new Error(
'No engines available. Make sure the pool is initialized and inferenceServer.start() or ModelPool.init() were called.',
)
}
const model = this.config.models[modelId]
const engine = this.engines[model.engine]
if (!engine) {
throw new Error(`Engine not found: ${model.engine}`)
}
const autoGpuEnabled = !!engine.autoGpu
		// if the model is configured with gpu=auto (or unset), we can use the gpu if it's not locked
const autoGpu = model.device?.gpu === undefined || model.device?.gpu === 'auto'
let useGpu = autoGpu ? autoGpuEnabled && !this.gpuLock : false
		// gpu=auto was already handled above; only force the gpu when it is explicitly requested
		if (model.device?.gpu && model.device.gpu !== 'auto') {
			useGpu = true
		}
const instance = new ModelInstance(engine, {
...model,
gpu: useGpu,
log: this.log,
})
this.instances[instance.id] = instance
if (useGpu) {
this.gpuLock = true
}
const signals = [this.shutdownController.signal]
if (options.signal) {
signals.push(options.signal)
}
const abortSignal = mergeAbortSignals(signals)
if (this.prepareInstance) {
this.log(LogLevels.debug, 'Preparing instance', {
instance: instance.id,
})
try {
await this.prepareInstance(instance, abortSignal)
instance.status = 'idle'
} catch (error) {
this.log(LogLevels.error, 'Error preparing instance', {
model: modelId,
instance: instance.id,
error,
})
instance.status = 'error'
return instance
}
}
await instance.load(abortSignal)
if (options.emit !== false) {
this.emit('spawn', instance)
}
return instance
}
// wait to acquire a gpu instance for the given request
private acquireGpuInstance(request: ModelInstanceRequest, signal?: AbortSignal): Promise<ModelInstance> {
return new Promise(async (resolve, reject) => {
			// if we have an idle gpu instance and the model matches we can lock and return immediately
			const gpuInstance = Object.values(this.instances).find((instance) => instance.gpu === true)
			if (gpuInstance?.status === 'idle') {
if (gpuInstance.modelId === request.model) {
gpuInstance.lock(request)
resolve(gpuInstance)
return
} else {
await this.disposeInstance(gpuInstance)
const newInstance = await this.spawnInstance(request.model, {
emit: false,
})
newInstance.lock(request)
resolve(newInstance)
return
}
}
			// otherwise attach a listener and wait until a gpu slot becomes available
const listener = async (instance: ModelInstance) => {
if (instance.gpu === true && instance.status === 'idle') {
if (instance.matchesRequirements(request)) {
						// model matches what's needed, lock and resolve
this.off('release', listener)
instance.lock(request)
resolve(instance)
} else {
						// model doesn't match, dispose it and spawn a new instance
this.off('release', listener)
await this.disposeInstance(instance)
const newInstance = await this.spawnInstance(request.model, {
emit: false,
})
newInstance.lock(request)
resolve(newInstance)
}
}
}
this.on('release', listener)
if (signal) {
signal.addEventListener('abort', () => {
this.off('release', listener)
reject(signal.reason)
})
}
})
}
// wait to acquire an idle instance for the given request
private acquireIdleInstance(request: ModelInstanceRequest, signal?: AbortSignal): Promise<ModelInstance> {
return new Promise((resolve, reject) => {
const listener = (instance: ModelInstance) => {
if (instance.matchesRequirements(request) && instance.status === 'idle') {
this.off('release', listener)
this.off('spawn', listener)
try {
instance.lock(request)
resolve(instance)
} catch (error: any) {
this.log(LogLevels.error, 'Error acquiring idle instance', {
error,
})
reject(error)
}
}
}
this.on('spawn', listener)
this.on('release', listener)
if (signal) {
signal.addEventListener('abort', () => {
this.off('release', listener)
this.off('spawn', listener)
reject(signal.reason)
})
}
})
}
// acquire an instance for the given request
private async acquireInstance(request: ModelInstanceRequest, signal?: AbortSignal) {
if ('messages' in request || 'prompt' in request) {
// for text and chat completions first search for an instance that has the context ready
for (const key in this.instances) {
const instance = this.instances[key]
if (
instance.matchesRequirements(request) &&
instance.status === 'idle' &&
instance.matchesContextState(request)
) {
this.log(LogLevels.debug, 'Cache hit - reusing cached instance', {
instance: instance.id,
sequence: request.sequence,
})
instance.lock(request)
return instance
}
}
this.log(LogLevels.debug, 'Cache miss - continue acquiring model instance', { sequence: request.sequence })
}
// prefer an instance of the model that has no context state.
for (const key in this.instances) {
const instance = this.instances[key]
if (instance.matchesRequirements(request) && instance.status === 'idle' && !instance.hasContextState()) {
this.log(LogLevels.debug, 'Reusing idle instance without context state', {
instance: instance.id,
sequence: request.sequence,
})
instance.lock(request)
return instance
}
}
		// still haven't found one; see if we're allowed to spawn a new instance
if (this.canSpawnInstance(request.model)) {
const instance = await this.spawnInstance(request.model, {
emit: false,
})
			// reset the context if the request doesn't match the instance's preloaded context state
const hasInitialContextState = instance.config.initialMessages?.length || instance.config.prefix
if (hasInitialContextState && !instance.matchesContextState(request)) {
instance.resetContext()
}
this.log(LogLevels.debug, 'Spawned instance acquired', {
instance: instance.id,
sequence: request.sequence,
})
instance.lock(request)
return instance
}
// if all instances have cached state, prefer the one that was used the longest time ago
const availableInstances = Object.values(this.instances).filter(
(instance) => instance.matchesRequirements(request) && instance.status === 'idle',
)
if (availableInstances.length > 0) {
const lruInstance = availableInstances.reduce((prev, current) =>
prev.lastUsed < current.lastUsed ? prev : current,
)
this.log(LogLevels.debug, 'Reusing least recently used instance', {
instance: lruInstance.id,
sequence: request.sequence,
})
lruInstance.lock(request)
lruInstance.resetContext() // make sure we reset its cache.
return lruInstance
}
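		// the model needs the gpu but another model is currently holding it:
		// wait for the gpu instance to be released, then swap in the requested model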
const requiresGpu = this.config.models[request.model].device?.gpu === true
if (requiresGpu && this.gpuLock) {
const gpuInstance = Object.values(this.instances).find((instance) => instance.gpu === true)!
if (gpuInstance.modelId !== request.model) {
this.log(LogLevels.debug, 'GPU already in use, waiting ...', {
sequence: request.sequence,
})
const instance = await this.acquireGpuInstance(request, signal)
this.log(LogLevels.debug, 'GPU instance acquired', {
instance: instance.id,
sequence: request.sequence,
})
if (signal?.aborted) {
instance.unlock()
throw signal.reason
} else {
return instance
}
}
}
		// before starting to wait, make sure we're not stuck waiting forever on an errored instance
		// currently instances only enter the error state if prepareInstance throws
const errorInstance = Object.values(this.instances).find(
(instance) => instance.modelId === request.model && instance.status === 'error',
)
if (errorInstance) {
throw new Error('Instance is in error state')
}
// wait until an instance of our model is released or spawned
this.log(LogLevels.debug, 'Awaiting idle instance', {
model: request.model,
sequence: request.sequence,
})
const instance = await this.acquireIdleInstance(request, signal)
this.log(LogLevels.debug, 'Idle instance acquired', {
instance: instance.id,
sequence: request.sequence,
})
if (signal?.aborted) {
instance.unlock()
throw signal.reason
} else {
return instance
}
}
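	// next sequence number used to correlate a request across log lines; wraps after 999999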
private createRequestSequence() {
if (this.requestSequence > 999999) {
this.requestSequence = 0
}
return ++this.requestSequence
}
// requests a model instance from the pool
async requestInstance(params: Partial<InferenceParams>, signal?: AbortSignal): Promise<ModelInstanceHandle> {
if (this.shutdownController.signal.aborted) {
throw new Error('Pool is disposed')
}
const requestSequence = this.createRequestSequence()
const request = {
...params as InferenceParams,
sequence: requestSequence,
abortController: new AbortController(),
}
if (!this.config.models[request.model]) {
this.log(LogLevels.error, `Model not found: ${request.model}`)
throw new Error(`Model not found: ${request.model}`)
}
this.log(LogLevels.info, 'Incoming request', {
model: request.model,
sequence: request.sequence,
})
this.pendingRequests.add(request)
const abortSignal = mergeAbortSignals([request.abortController.signal, signal])
abortSignal.addEventListener('abort', () => {
this.log(LogLevels.info, 'Request aborted', {
model: request.model,
sequence: request.sequence,
})
this.pendingRequests.delete(request)
})
const instance = await this.acquireInstance(request, abortSignal)
		// once the instance is acquired & locked, we can hand it to the caller.
		// the queue task stays pending until release is called, keeping the concurrency slot occupied
let resolveQueueTask: (value: ModelTask) => void = () => {}
this.queue
.add((): Promise<ModelTask> => {
this.pendingRequests.delete(request)
return new Promise((resolve, reject) => {
resolveQueueTask = resolve
})
})
.then((task) => {
// if there are more requests waiting, prioritize handling them before spawning new instances
// deferred to avoid AbortError when the pool is disposed right after the operation
process.nextTick(() => {
if (
!this.pendingRequests.size &&
this.canSpawnInstance(request.model) &&
!this.shutdownController.signal.aborted
) {
this.spawnInstance(request.model)
}
})
if (task?.instance) {
this.emit('release', instance)
}
})
// TODO what if user never calls release? automatically resolve or reject after a timeout?
const releaseInstance = () => {
return new Promise<void>((resolve, reject) => {
process.nextTick(() => {
resolveQueueTask({ instance, request })
this.log(LogLevels.info, 'Task completed, releasing', {
instance: instance.id,
sequence: request.sequence,
})
if (instance.config.ttl === 0) {
this.disposeInstance(instance)
} else {
instance.unlock()
}
resolve()
})
})
}
return {
instance,
release: releaseInstance,
}
}
}
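// ------------------------------------------------------------------
// Usage sketch (illustrative, not part of the file above): one way a
// caller might drive the pool. `chatEngine` stands in for a real
// ModelEngine implementation, and the model config lists only the
// fields this file reads (engine, device, minInstances, maxInstances);
// a real ModelConfig will need whatever else your engine requires.
declare const chatEngine: ModelEngine

async function exampleUsage() {
	const pool = new ModelPool({
		concurrency: 2,
		models: {
			'chat-model': {
				engine: 'chat', // key into the engines record passed to init()
				device: { gpu: 'auto' },
				minInstances: 1,
				maxInstances: 2,
			},
		},
	})
	await pool.init({ chat: chatEngine })
	const { instance, release } = await pool.requestInstance({ model: 'chat-model' })
	try {
		// run inference against `instance` here
	} finally {
		// always release; the queue slot stays occupied until release() resolves
		await release()
	}
	await pool.dispose()
}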