inference-server
Libraries and a server for building AI applications, with adapters to various native bindings for local inference. Integrate it with your application, or run it as a microservice.
JavaScript
import process from 'node:process';
import PQueue from 'p-queue';
import EventEmitter3 from 'eventemitter3';
import { ModelInstance } from './instance.js';
import { LogLevels, createSublogger } from './lib/logger.js';
import { mergeAbortSignals } from './lib/util.js';
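/**
 * ModelPool manages a set of ModelInstance objects: it spawns instances up to each
 * model's maxInstances, keeps minInstances alive, disposes idle instances after
 * their ttl, and serializes access to a single gpu via gpuLock.
 *
 * A rough usage sketch, based only on the API in this file; the model name,
 * engine name, and `engines` map are illustrative placeholders:
 *
 *   const pool = new ModelPool({
 *       concurrency: 2,
 *       models: {
 *           'my-model': {
 *               engine: 'my-engine', // key into the engines map passed to init()
 *               device: { gpu: 'auto' }, // true = must run on gpu; 'auto'/undefined = use gpu if free
 *               minInstances: 1,
 *               maxInstances: 2,
 *               ttl: 300, // idle seconds before auto-disposal; 0 = dispose after each use
 *           },
 *       },
 *   });
 *   await pool.init(engines); // engines: { 'my-engine': engineImplementation }
 *   const { instance, release } = await pool.requestInstance({ model: 'my-model' });
 *   // ... run inference on the instance ...
 *   await release();
 */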
export class ModelPool extends EventEmitter3 {
queue;
config;
instances;
engines;
cleanupInterval;
log;
requestSequence = 0;
pendingRequests = new Set();
shutdownController = new AbortController();
gpuLock = false; // TODO could derive this from "is there any instance that has gpu=true"
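// optional async hook called with (instance, abortSignal) before an instance loads;
// if it throws, the instance is marked status='error' (see spawnInstance)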
prepareInstance;
constructor(options, prepareInstance) {
super();
this.log = createSublogger(options.log);
const models = {};
for (const id in options.models) {
const modelConfig = options.models[id];
models[id] = {
...modelConfig,
id: modelConfig.id ?? id,
};
}
const config = {
concurrency: 1,
...options,
models,
};
this.queue = new PQueue({
concurrency: config.concurrency,
});
this.config = config;
this.instances = {};
this.prepareInstance = prepareInstance;
}
// start up pool, creating instances and loading models
async init(engines) {
const initPromises = [];
const modelConfigs = this.config.models;
this.engines = engines;
// making sure id is set.
for (const modelId in modelConfigs) {
const modelConfig = modelConfigs[modelId];
if (!modelConfig.id) {
modelConfig.id = modelId;
}
}
// prioritize initializing the first model defined that has gpu explicitly set,
// so the lock can't be acquired first by another model that has gpu=auto/undefined
const firstGpuModel = Object.entries(modelConfigs).find(([id, config]) => !!config.device?.gpu && config.device?.gpu !== 'auto');
if (firstGpuModel) {
const modelConfig = modelConfigs[firstGpuModel[0]];
const spawnPromises = this.ensureModelInstances(modelConfig);
initPromises.push(...spawnPromises);
}
// then handle other models in the order they were defined
for (const modelId in modelConfigs) {
if (firstGpuModel && modelId === firstGpuModel[0]) {
continue;
}
const modelConfig = modelConfigs[modelId];
const spawnPromises = this.ensureModelInstances(modelConfig);
initPromises.push(...spawnPromises);
}
if (initPromises.length) {
this.log(LogLevels.debug, 'Spawning initial instances ...');
// resolve when all initial instances are loaded
await Promise.allSettled(initPromises);
const instanceCount = Object.values(this.instances).length;
this.log(LogLevels.debug, 'Initial instances ready', {
count: instanceCount,
});
}
this.emit('ready');
this.cleanupInterval = setInterval(() => {
this.disposeOutdatedInstances();
}, 1000 * 60); // every minute
}
// Check if the minInstances for a model are spawned. If not, spawn them.
ensureModelInstances(model) {
const spawnPromises = [];
const instanceCount = model.minInstances ?? 0;
for (let i = 0; i < instanceCount; i++) {
if (this.canSpawnInstance(model.id)) {
const spawnPromise = this.spawnInstance(model.id);
spawnPromises.push(spawnPromise);
}
else {
this.log(LogLevels.warn, 'Failed to spawn min instances for', {
model: model.id,
});
break;
}
}
return spawnPromises;
}
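// shut down the pool: stop the queue, abort pending requests and dispose all instances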
async dispose() {
this.log(LogLevels.debug, 'Disposing pool');
clearInterval(this.cleanupInterval);
super.removeAllListeners();
this.queue.pause();
this.queue.clear();
this.shutdownController.abort();
for (const request of this.pendingRequests) {
request.abortController.abort();
}
const disposePromises = [];
for (const key in this.instances) {
const instance = this.instances[key];
disposePromises.push(this.disposeInstance(instance));
}
await Promise.allSettled(disposePromises);
}
// disposes instances that have been idle for longer than their ttl
disposeOutdatedInstances() {
const now = new Date().getTime();
for (const key in this.instances) {
const instance = this.instances[key];
const instanceAge = (now - instance.lastUsed) / 1000; // idle time in seconds; ttl is compared in seconds
const modelInstanceCount = Object.values(this.instances).filter((i) => i.modelId === instance.modelId).length;
const minInstanceCount = this.config.models[instance.modelId].minInstances ?? 0;
if (modelInstanceCount > minInstanceCount && instanceAge > instance.ttl && instance.status === 'idle') {
this.log(LogLevels.info, 'Auto disposing instance', {
instance: instance.id,
});
this.disposeInstance(instance);
}
}
}
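// returns a snapshot of the pool: busy instance count, pending request count,
// and per-instance model/status/engine/device/context/lastUsed info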
getStatus() {
const processingInstances = Object.values(this.instances).filter((instance) => instance.status === 'busy');
const poolStatusInfo = {
processing: processingInstances.length,
pending: this.pendingRequests.size,
instances: Object.fromEntries(Object.entries(this.instances).map(([key, instance]) => {
return [
key,
{
model: instance.modelId,
status: instance.status,
engine: instance.config.engine,
device: instance.gpu ? 'gpu' : 'cpu',
contextState: instance.getContextStateIdentity(),
lastUsed: new Date(instance.lastUsed).toISOString(),
},
];
})),
};
return poolStatusInfo;
}
// checks if another instance can be spawned for given model
canSpawnInstance(modelId) {
const modelConfig = this.config.models[modelId];
// if the model is configured with gpu=true, interpret that as "it MUST run on gpu"
// and prevent spawning more instances if the gpu is already locked.
const requiresGpu = !!modelConfig.device?.gpu;
if (requiresGpu && this.gpuLock && modelConfig.device?.gpu !== 'auto') {
this.log(LogLevels.verbose, 'Cannot spawn new instance: model requires gpu, but the gpu is already in use', {
model: modelId,
});
return false;
}
// see if we're within maxInstances limit
const maxInstances = modelConfig.maxInstances ?? 1;
const currentInstances = Object.values(this.instances).filter((instance) => instance.modelId === modelId);
if (currentInstances.length >= maxInstances) {
this.log(LogLevels.verbose, 'Cannot spawn new instance: maxInstances reached', {
model: modelId,
current: currentInstances.length,
max: maxInstances,
});
return false;
}
return true;
}
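// disposes a single instance and releases the gpu lock if the instance held it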
async disposeInstance(instance) {
this.log(LogLevels.debug, 'Disposing instance', {
instance: instance.id,
});
await instance.dispose();
if (instance.gpu) {
this.gpuLock = false;
}
delete this.instances[instance.id];
}
// spawns a new instance for the given model, without checking whether it's allowed
async spawnInstance(modelId, options = {}) {
if (!this.engines) {
throw new Error('No engines available. Make sure the pool is initialized and inferenceServer.start() or ModelPool.init() were called.');
}
const model = this.config.models[modelId];
const engine = this.engines[model.engine];
if (!engine) {
throw new Error(`Engine not found: ${model.engine}`);
}
const autoGpuEnabled = !!engine.autoGpu;
// if the model is configured with gpu=auto (or unset), we can use the gpu if it's not locked
const autoGpu = model.device?.gpu === undefined || model.device?.gpu === 'auto';
let useGpu = autoGpu ? autoGpuEnabled && !this.gpuLock : false;
// an explicit, non-auto gpu setting always claims the gpu
if (!!model.device?.gpu && model.device?.gpu !== 'auto') {
useGpu = true;
}
const instance = new ModelInstance(engine, {
...model,
gpu: useGpu,
log: this.log,
});
this.instances[instance.id] = instance;
if (useGpu) {
this.gpuLock = true;
}
const signals = [this.shutdownController.signal];
if (options.signal) {
signals.push(options.signal);
}
const abortSignal = mergeAbortSignals(signals);
if (this.prepareInstance) {
this.log(LogLevels.debug, 'Preparing instance', {
instance: instance.id,
});
try {
await this.prepareInstance(instance, abortSignal);
instance.status = 'idle';
}
catch (error) {
this.log(LogLevels.error, 'Error preparing instance', {
model: modelId,
instance: instance.id,
error,
});
instance.status = 'error';
return instance;
}
}
await instance.load(abortSignal);
if (options.emit !== false) {
this.emit('spawn', instance);
}
return instance;
}
// wait to acquire a gpu instance for the given request
acquireGpuInstance(request, signal) {
return new Promise(async (resolve, reject) => {
// if we have an idle gpu instance and the model matches we can lock and return immediately
const gpuInstance = Object.values(this.instances).find((instance) => instance.gpu === true);
if (gpuInstance?.status === 'idle') {
if (gpuInstance.modelId === request.model) {
gpuInstance.lock(request);
resolve(gpuInstance);
return;
}
else {
// idle gpu instance is loaded with a different model: replace it
try {
await this.disposeInstance(gpuInstance);
const newInstance = await this.spawnInstance(request.model, {
emit: false,
});
newInstance.lock(request);
resolve(newInstance);
}
catch (error) {
reject(error);
}
return;
}
}
// otherwise attach the listener and wait until gpu slot becomes available
const listener = async (instance) => {
if (instance.gpu === true && instance.status === 'idle') {
if (instance.matchesRequirements(request)) {
// model matches what's needed, lock and resolve
this.off('release', listener);
instance.lock(request);
resolve(instance);
}
else {
// model doesn't match, dispose and spawn a new instance
this.off('release', listener);
try {
await this.disposeInstance(instance);
const newInstance = await this.spawnInstance(request.model, {
emit: false,
});
newInstance.lock(request);
resolve(newInstance);
}
catch (error) {
reject(error);
}
}
}
};
this.on('release', listener);
if (signal) {
signal.addEventListener('abort', () => {
this.off('release', listener);
reject(signal.reason);
});
}
});
}
// wait to acquire an idle instance for the given request
acquireIdleInstance(request, signal) {
return new Promise((resolve, reject) => {
const listener = (instance) => {
if (instance.matchesRequirements(request) && instance.status === 'idle') {
this.off('release', listener);
this.off('spawn', listener);
try {
instance.lock(request);
resolve(instance);
}
catch (error) {
this.log(LogLevels.error, 'Error acquiring idle instance', {
error,
});
reject(error);
}
}
};
this.on('spawn', listener);
this.on('release', listener);
if (signal) {
signal.addEventListener('abort', () => {
this.off('release', listener);
this.off('spawn', listener);
reject(signal.reason);
});
}
});
}
// acquire an instance for the given request
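// Acquisition strategy, in order: (1) idle instance whose context state already
// matches the request, (2) idle instance without any context state, (3) spawn a
// new instance if allowed, (4) least recently used idle instance (context is
// reset), (5) wait for the gpu or for a matching instance to be released/spawned.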
async acquireInstance(request, signal) {
if ('messages' in request || 'prompt' in request) {
// for text and chat completions first search for an instance that has the context ready
for (const key in this.instances) {
const instance = this.instances[key];
if (instance.matchesRequirements(request) &&
instance.status === 'idle' &&
instance.matchesContextState(request)) {
this.log(LogLevels.debug, 'Cache hit - reusing cached instance', {
instance: instance.id,
sequence: request.sequence,
});
instance.lock(request);
return instance;
}
}
this.log(LogLevels.debug, 'Cache miss - continue acquiring model instance', { sequence: request.sequence });
}
// prefer an instance of the model that has no context state.
for (const key in this.instances) {
const instance = this.instances[key];
if (instance.matchesRequirements(request) && instance.status === 'idle' && !instance.hasContextState()) {
this.log(LogLevels.debug, 'Reusing idle instance without context state', {
instance: instance.id,
sequence: request.sequence,
});
instance.lock(request);
return instance;
}
}
// still haven't found one, see if we're allowed to spawn a new instance
if (this.canSpawnInstance(request.model)) {
const instance = await this.spawnInstance(request.model, {
emit: false,
});
// reset the context if the request doesn't match the instance's preloaded context state
const hasInitialContextState = instance.config.initialMessages?.length || instance.config.prefix;
if (hasInitialContextState && !instance.matchesContextState(request)) {
instance.resetContext();
}
this.log(LogLevels.debug, 'Spawned instance acquired', {
instance: instance.id,
sequence: request.sequence,
});
instance.lock(request);
return instance;
}
// if all instances have cached state, prefer the one that was used the longest time ago
const availableInstances = Object.values(this.instances).filter((instance) => instance.matchesRequirements(request) && instance.status === 'idle');
if (availableInstances.length > 0) {
const lruInstance = availableInstances.reduce((prev, current) => prev.lastUsed < current.lastUsed ? prev : current);
this.log(LogLevels.debug, 'Reusing least recently used instance', {
instance: lruInstance.id,
sequence: request.sequence,
});
lruInstance.lock(request);
lruInstance.resetContext(); // make sure we reset its cache.
return lruInstance;
}
const requiresGpu = this.config.models[request.model].device?.gpu === true;
if (requiresGpu && this.gpuLock) {
const gpuInstance = Object.values(this.instances).find((instance) => instance.gpu === true);
if (gpuInstance && gpuInstance.modelId !== request.model) {
this.log(LogLevels.debug, 'GPU already in use, waiting ...', {
sequence: request.sequence,
});
const instance = await this.acquireGpuInstance(request, signal);
this.log(LogLevels.debug, 'GPU instance acquired', {
instance: instance.id,
sequence: request.sequence,
});
if (signal?.aborted) {
instance.unlock();
throw signal.reason;
}
else {
return instance;
}
}
}
// before starting to wait, make sure we're not stuck waiting forever on an errored instance
// currently instances only enter the error state if prepareInstance throws an error
const errorInstance = Object.values(this.instances).find((instance) => instance.modelId === request.model && instance.status === 'error');
if (errorInstance) {
throw new Error(`Instance of model ${request.model} is in error state`);
}
// wait until an instance of our model is released or spawned
this.log(LogLevels.debug, 'Awaiting idle instance', {
model: request.model,
sequence: request.sequence,
});
const instance = await this.acquireIdleInstance(request, signal);
this.log(LogLevels.debug, 'Idle instance acquired', {
instance: instance.id,
sequence: request.sequence,
});
if (signal?.aborted) {
instance.unlock();
throw signal.reason;
}
else {
return instance;
}
}
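// returns an incrementing id used to correlate a request's log lines; wraps back to 1 after 999999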
createRequestSequence() {
if (this.requestSequence > 999999) {
this.requestSequence = 0;
}
return ++this.requestSequence;
}
// requests a model instance from the pool
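// resolves with { instance, release }; the caller must invoke release() when done,
// otherwise the queue slot is never freed (see the TODO below)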
async requestInstance(params, signal) {
if (this.shutdownController.signal.aborted) {
throw new Error('Pool is disposed');
}
const requestSequence = this.createRequestSequence();
const request = {
...params,
sequence: requestSequence,
abortController: new AbortController(),
};
if (!this.config.models[request.model]) {
this.log(LogLevels.error, `Model not found: ${request.model}`);
throw new Error(`Model not found: ${request.model}`);
}
this.log(LogLevels.info, 'Incoming request', {
model: request.model,
sequence: request.sequence,
});
this.pendingRequests.add(request);
const abortSignal = mergeAbortSignals([request.abortController.signal, signal]);
abortSignal.addEventListener('abort', () => {
this.log(LogLevels.info, 'Request aborted', {
model: request.model,
sequence: request.sequence,
});
this.pendingRequests.delete(request);
});
const instance = await this.acquireInstance(request, abortSignal);
// once instance is acquired & locked, we can pass it on to the caller
// the queue task promise will be forwarded as releaseInstance
let resolveQueueTask = () => { };
this.queue
.add(() => {
this.pendingRequests.delete(request);
return new Promise((resolve, reject) => {
resolveQueueTask = resolve;
});
})
.then((task) => {
// if there are more requests waiting, prioritize handling them before spawning new instances
// deferred to avoid AbortError when the pool is disposed right after the operation
process.nextTick(() => {
if (!this.pendingRequests.size &&
this.canSpawnInstance(request.model) &&
!this.shutdownController.signal.aborted) {
this.spawnInstance(request.model);
}
});
if (task?.instance) {
this.emit('release', instance);
}
});
// TODO what if user never calls release? automatically resolve or reject after a timeout?
const releaseInstance = () => {
return new Promise((resolve, reject) => {
process.nextTick(() => {
resolveQueueTask({ instance, request });
this.log(LogLevels.info, 'Task completed, releasing', {
instance: instance.id,
sequence: request.sequence,
});
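// a ttl of 0 means single-use: dispose immediately instead of returning the instance to the pool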
if (instance.config.ttl === 0) {
this.disposeInstance(instance);
}
else {
instance.unlock();
}
resolve();
});
});
};
return {
instance,
release: releaseInstance,
};
}
}
//# sourceMappingURL=pool.js.map